xref: /kernel/linux/linux-5.10/drivers/md/raid5.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * raid5.c : Multiple Devices driver for Linux
4 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
5 *	   Copyright (C) 1999, 2000 Ingo Molnar
6 *	   Copyright (C) 2002, 2003 H. Peter Anvin
7 *
8 * RAID-4/5/6 management functions.
9 * Thanks to Penguin Computing for making the RAID-6 development possible
10 * by donating a test server!
11 */
12
13/*
14 * BITMAP UNPLUGGING:
15 *
16 * The sequencing for updating the bitmap reliably is a little
17 * subtle (and I got it wrong the first time) so it deserves some
18 * explanation.
19 *
20 * We group bitmap updates into batches.  Each batch has a number.
21 * We may write out several batches at once, but that isn't very important.
22 * conf->seq_write is the number of the last batch successfully written.
23 * conf->seq_flush is the number of the last batch that was closed to
24 *    new additions.
25 * When we discover that we will need to write to any block in a stripe
26 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
27 * the number of the batch it will be in. This is seq_flush+1.
28 * When we are ready to do a write, if that batch hasn't been written yet,
29 *   we plug the array and queue the stripe for later.
30 * When an unplug happens, we increment seq_flush, thus closing the current
31 *   batch.
32 * When we notice that seq_flush > seq_write, we write out all pending updates
33 * to the bitmap, and advance seq_write to where seq_flush was.
34 * This may occasionally write a bit out twice, but is sure never to
35 * miss any bits.
36 */
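
/*
 * Worked example (hypothetical numbers): suppose seq_flush == seq_write == 7.
 * A new write updates the in-memory bitmap and records sh->bm_seq = 8
 * (seq_flush + 1) with STRIPE_BIT_DELAY set.  In do_release_stripe() below,
 * bm_seq - seq_write > 0, so the stripe is parked on conf->bitmap_list.
 * Only after the batch is closed (seq_flush becomes 8) and its bitmap
 * updates are written out (seq_write advances to 8) is the stripe handled.
 */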
37
38#include <linux/blkdev.h>
39#include <linux/kthread.h>
40#include <linux/raid/pq.h>
41#include <linux/async_tx.h>
42#include <linux/module.h>
43#include <linux/async.h>
44#include <linux/seq_file.h>
45#include <linux/cpu.h>
46#include <linux/slab.h>
47#include <linux/ratelimit.h>
48#include <linux/nodemask.h>
49
50#include <trace/events/block.h>
51#include <linux/list_sort.h>
52
53#include "md.h"
54#include "raid5.h"
55#include "raid0.h"
56#include "md-bitmap.h"
57#include "raid5-log.h"
58
59#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
60
61#define cpu_to_group(cpu) cpu_to_node(cpu)
62#define ANY_GROUP NUMA_NO_NODE
63
64static bool devices_handle_discard_safely = false;
65module_param(devices_handle_discard_safely, bool, 0644);
66MODULE_PARM_DESC(devices_handle_discard_safely,
67		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
68static struct workqueue_struct *raid5_wq;
69
70static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
71{
72	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
73	return &conf->stripe_hashtbl[hash];
74}
75
76static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
77{
78	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
79}
80
81static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
82{
83	spin_lock_irq(conf->hash_locks + hash);
84	spin_lock(&conf->device_lock);
85}
86
87static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
88{
89	spin_unlock(&conf->device_lock);
90	spin_unlock_irq(conf->hash_locks + hash);
91}
92
93static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
94{
95	int i;
96	spin_lock_irq(conf->hash_locks);
97	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
98		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
99	spin_lock(&conf->device_lock);
100}
101
102static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
103{
104	int i;
105	spin_unlock(&conf->device_lock);
106	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
107		spin_unlock(conf->hash_locks + i);
108	spin_unlock_irq(conf->hash_locks);
109}
110
111/* Find first data disk in a raid6 stripe */
112static inline int raid6_d0(struct stripe_head *sh)
113{
114	if (sh->ddf_layout)
115		/* ddf always starts from the first device */
116		return 0;
117	/* md starts just after Q block */
118	if (sh->qd_idx == sh->disks - 1)
119		return 0;
120	else
121		return sh->qd_idx + 1;
122}
123static inline int raid6_next_disk(int disk, int raid_disks)
124{
125	disk++;
126	return (disk < raid_disks) ? disk : 0;
127}
128
129/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
130 * we need to map each disk to a 'slot', where the data disks are slots
131 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
132 * is raid_disks-1.  This helper does that mapping.
133 */
134static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
135			     int *count, int syndrome_disks)
136{
137	int slot = *count;
138
139	if (sh->ddf_layout)
140		(*count)++;
141	if (idx == sh->pd_idx)
142		return syndrome_disks;
143	if (idx == sh->qd_idx)
144		return syndrome_disks + 1;
145	if (!sh->ddf_layout)
146		(*count)++;
147	return slot;
148}
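
/*
 * Worked example for the default md layout (not DDF), with hypothetical
 * indices: disks = 5, pd_idx = 3, qd_idx = 4, so syndrome_disks = 3 and
 * raid6_d0() returns 0.  Walking 0,1,2,3,4 with raid6_next_disk() maps
 * data disks 0..2 to slots 0..2, the parity disk to slot 3
 * (syndrome_disks) and the Q disk to slot 4 (syndrome_disks + 1).
 */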
149
150static void print_raid5_conf (struct r5conf *conf);
151
152static int stripe_operations_active(struct stripe_head *sh)
153{
154	return sh->check_state || sh->reconstruct_state ||
155	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
156	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
157}
158
159static bool stripe_is_lowprio(struct stripe_head *sh)
160{
161	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
162		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
163	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
164}
165
166static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
167{
168	struct r5conf *conf = sh->raid_conf;
169	struct r5worker_group *group;
170	int thread_cnt;
171	int i, cpu = sh->cpu;
172
173	if (!cpu_online(cpu)) {
174		cpu = cpumask_any(cpu_online_mask);
175		sh->cpu = cpu;
176	}
177
178	if (list_empty(&sh->lru)) {
179		struct r5worker_group *group;
180		group = conf->worker_groups + cpu_to_group(cpu);
181		if (stripe_is_lowprio(sh))
182			list_add_tail(&sh->lru, &group->loprio_list);
183		else
184			list_add_tail(&sh->lru, &group->handle_list);
185		group->stripes_cnt++;
186		sh->group = group;
187	}
188
189	if (conf->worker_cnt_per_group == 0) {
190		md_wakeup_thread(conf->mddev->thread);
191		return;
192	}
193
194	group = conf->worker_groups + cpu_to_group(sh->cpu);
195
196	group->workers[0].working = true;
197	/* at least one worker should run to avoid race */
198	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
199
200	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
201	/* wakeup more workers */
202	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
203		if (group->workers[i].working == false) {
204			group->workers[i].working = true;
205			queue_work_on(sh->cpu, raid5_wq,
206				      &group->workers[i].work);
207			thread_cnt--;
208		}
209	}
210}
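
/*
 * Note on the wakeup heuristic above: when worker groups are in use,
 * workers[0] of the group is always queued and thread_cnt additional
 * workers are woken based on backlog.  For example, if a group has
 * 4 * MAX_STRIPE_BATCH stripes queued, thread_cnt starts at 3, so up to
 * four workers in total may run (subject to conf->worker_cnt_per_group).
 */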
211
212static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
213			      struct list_head *temp_inactive_list)
214{
215	int i;
216	int injournal = 0;	/* number of data pages with R5_InJournal */
217
218	BUG_ON(!list_empty(&sh->lru));
219	BUG_ON(atomic_read(&conf->active_stripes)==0);
220
221	if (r5c_is_writeback(conf->log))
222		for (i = sh->disks; i--; )
223			if (test_bit(R5_InJournal, &sh->dev[i].flags))
224				injournal++;
225	/*
226	 * In the following cases, the stripe cannot be released to the cached
227	 * lists. Therefore, we make the stripe write out and set
228	 * STRIPE_HANDLE:
229	 *   1. when the array is quiesced in r5c write-back mode;
230	 *   2. when a resync is requested for the stripe.
231	 */
232	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
233	    (conf->quiesce && r5c_is_writeback(conf->log) &&
234	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
235		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
236			r5c_make_stripe_write_out(sh);
237		set_bit(STRIPE_HANDLE, &sh->state);
238	}
239
240	if (test_bit(STRIPE_HANDLE, &sh->state)) {
241		if (test_bit(STRIPE_DELAYED, &sh->state) &&
242		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
243			list_add_tail(&sh->lru, &conf->delayed_list);
244		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
245			   sh->bm_seq - conf->seq_write > 0)
246			list_add_tail(&sh->lru, &conf->bitmap_list);
247		else {
248			clear_bit(STRIPE_DELAYED, &sh->state);
249			clear_bit(STRIPE_BIT_DELAY, &sh->state);
250			if (conf->worker_cnt_per_group == 0) {
251				if (stripe_is_lowprio(sh))
252					list_add_tail(&sh->lru,
253							&conf->loprio_list);
254				else
255					list_add_tail(&sh->lru,
256							&conf->handle_list);
257			} else {
258				raid5_wakeup_stripe_thread(sh);
259				return;
260			}
261		}
262		md_wakeup_thread(conf->mddev->thread);
263	} else {
264		BUG_ON(stripe_operations_active(sh));
265		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
266			if (atomic_dec_return(&conf->preread_active_stripes)
267			    < IO_THRESHOLD)
268				md_wakeup_thread(conf->mddev->thread);
269		atomic_dec(&conf->active_stripes);
270		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
271			if (!r5c_is_writeback(conf->log))
272				list_add_tail(&sh->lru, temp_inactive_list);
273			else {
274				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
275				if (injournal == 0)
276					list_add_tail(&sh->lru, temp_inactive_list);
277				else if (injournal == conf->raid_disks - conf->max_degraded) {
278					/* full stripe */
279					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
280						atomic_inc(&conf->r5c_cached_full_stripes);
281					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
282						atomic_dec(&conf->r5c_cached_partial_stripes);
283					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
284					r5c_check_cached_full_stripe(conf);
285				} else
286					/*
287					 * STRIPE_R5C_PARTIAL_STRIPE is set in
288					 * r5c_try_caching_write(). No need to
289					 * set it again.
290					 */
291					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
292			}
293		}
294	}
295}
296
297static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
298			     struct list_head *temp_inactive_list)
299{
300	if (atomic_dec_and_test(&sh->count))
301		do_release_stripe(conf, sh, temp_inactive_list);
302}
303
304/*
305 * @hash could be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is
306 * an array with one list per hash lock.
307 * Be careful: only one task can add/delete stripes from temp_inactive_list
308 * at a given time. Adding stripes only takes the device lock, while deleting
309 * stripes only takes the hash lock.
310 */
311static void release_inactive_stripe_list(struct r5conf *conf,
312					 struct list_head *temp_inactive_list,
313					 int hash)
314{
315	int size;
316	bool do_wakeup = false;
317	unsigned long flags;
318
319	if (hash == NR_STRIPE_HASH_LOCKS) {
320		size = NR_STRIPE_HASH_LOCKS;
321		hash = NR_STRIPE_HASH_LOCKS - 1;
322	} else
323		size = 1;
324	while (size) {
325		struct list_head *list = &temp_inactive_list[size - 1];
326
327		/*
328		 * We don't hold any lock here yet, so raid5_get_active_stripe()
329		 * might remove stripes from the list.
330		 */
331		if (!list_empty_careful(list)) {
332			spin_lock_irqsave(conf->hash_locks + hash, flags);
333			if (list_empty(conf->inactive_list + hash) &&
334			    !list_empty(list))
335				atomic_dec(&conf->empty_inactive_list_nr);
336			list_splice_tail_init(list, conf->inactive_list + hash);
337			do_wakeup = true;
338			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
339		}
340		size--;
341		hash--;
342	}
343
344	if (do_wakeup) {
345		wake_up(&conf->wait_for_stripe);
346		if (atomic_read(&conf->active_stripes) == 0)
347			wake_up(&conf->wait_for_quiescent);
348		if (conf->retry_read_aligned)
349			md_wakeup_thread(conf->mddev->thread);
350	}
351}
352
353/* should hold conf->device_lock already */
354static int release_stripe_list(struct r5conf *conf,
355			       struct list_head *temp_inactive_list)
356{
357	struct stripe_head *sh, *t;
358	int count = 0;
359	struct llist_node *head;
360
361	head = llist_del_all(&conf->released_stripes);
362	head = llist_reverse_order(head);
363	llist_for_each_entry_safe(sh, t, head, release_list) {
364		int hash;
365
366		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
367		smp_mb();
368		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
369		/*
370		 * Don't worry about the bit being set here: if the bit is set
371		 * again, the count is always > 1. The same is true for the
372		 * STRIPE_ON_UNPLUG_LIST bit.
373		 */
374		hash = sh->hash_lock_index;
375		__release_stripe(conf, sh, &temp_inactive_list[hash]);
376		count++;
377	}
378
379	return count;
380}
381
382void raid5_release_stripe(struct stripe_head *sh)
383{
384	struct r5conf *conf = sh->raid_conf;
385	unsigned long flags;
386	struct list_head list;
387	int hash;
388	bool wakeup;
389
390	/* Avoid release_list until the last reference.
391	 */
392	if (atomic_add_unless(&sh->count, -1, 1))
393		return;
394
395	if (unlikely(!conf->mddev->thread) ||
396		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
397		goto slow_path;
398	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
399	if (wakeup)
400		md_wakeup_thread(conf->mddev->thread);
401	return;
402slow_path:
403	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
404	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
405		INIT_LIST_HEAD(&list);
406		hash = sh->hash_lock_index;
407		do_release_stripe(conf, sh, &list);
408		spin_unlock_irqrestore(&conf->device_lock, flags);
409		release_inactive_stripe_list(conf, &list, hash);
410	}
411}
412
413static inline void remove_hash(struct stripe_head *sh)
414{
415	pr_debug("remove_hash(), stripe %llu\n",
416		(unsigned long long)sh->sector);
417
418	hlist_del_init(&sh->hash);
419}
420
421static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
422{
423	struct hlist_head *hp = stripe_hash(conf, sh->sector);
424
425	pr_debug("insert_hash(), stripe %llu\n",
426		(unsigned long long)sh->sector);
427
428	hlist_add_head(&sh->hash, hp);
429}
430
431/* find an idle stripe, make sure it is unhashed, and return it. */
432static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
433{
434	struct stripe_head *sh = NULL;
435	struct list_head *first;
436
437	if (list_empty(conf->inactive_list + hash))
438		goto out;
439	first = (conf->inactive_list + hash)->next;
440	sh = list_entry(first, struct stripe_head, lru);
441	list_del_init(first);
442	remove_hash(sh);
443	atomic_inc(&conf->active_stripes);
444	BUG_ON(hash != sh->hash_lock_index);
445	if (list_empty(conf->inactive_list + hash))
446		atomic_inc(&conf->empty_inactive_list_nr);
447out:
448	return sh;
449}
450
451#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
452static void free_stripe_pages(struct stripe_head *sh)
453{
454	int i;
455	struct page *p;
456
457	/* The page pool has not been allocated yet */
458	if (!sh->pages)
459		return;
460
461	for (i = 0; i < sh->nr_pages; i++) {
462		p = sh->pages[i];
463		if (p)
464			put_page(p);
465		sh->pages[i] = NULL;
466	}
467}
468
469static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
470{
471	int i;
472	struct page *p;
473
474	for (i = 0; i < sh->nr_pages; i++) {
475		/* The page has already been allocated. */
476		if (sh->pages[i])
477			continue;
478
479		p = alloc_page(gfp);
480		if (!p) {
481			free_stripe_pages(sh);
482			return -ENOMEM;
483		}
484		sh->pages[i] = p;
485	}
486	return 0;
487}
488
489static int
490init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
491{
492	int nr_pages, cnt;
493
494	if (sh->pages)
495		return 0;
496
497	/* Each sh->dev[i] needs one buffer of conf->stripe_size bytes */
498	cnt = PAGE_SIZE / conf->stripe_size;
499	nr_pages = (disks + cnt - 1) / cnt;
500
501	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
502	if (!sh->pages)
503		return -ENOMEM;
504	sh->nr_pages = nr_pages;
505	sh->stripes_per_page = cnt;
506	return 0;
507}
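
/*
 * Sizing example (assuming a 64KB PAGE_SIZE kernel with the default 4KB
 * conf->stripe_size): cnt = 64K / 4K = 16 stripe buffers per page, so a
 * stripe_head covering 20 devices needs nr_pages = (20 + 15) / 16 = 2
 * shared pages.
 */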
508#endif
509
510static void shrink_buffers(struct stripe_head *sh)
511{
512	int i;
513	int num = sh->raid_conf->pool_size;
514
515#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
516	for (i = 0; i < num ; i++) {
517		struct page *p;
518
519		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
520		p = sh->dev[i].page;
521		if (!p)
522			continue;
523		sh->dev[i].page = NULL;
524		put_page(p);
525	}
526#else
527	for (i = 0; i < num; i++)
528		sh->dev[i].page = NULL;
529	free_stripe_pages(sh); /* Free pages */
530#endif
531}
532
533static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
534{
535	int i;
536	int num = sh->raid_conf->pool_size;
537
538#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
539	for (i = 0; i < num; i++) {
540		struct page *page;
541
542		if (!(page = alloc_page(gfp))) {
543			return 1;
544		}
545		sh->dev[i].page = page;
546		sh->dev[i].orig_page = page;
547		sh->dev[i].offset = 0;
548	}
549#else
550	if (alloc_stripe_pages(sh, gfp))
551		return -ENOMEM;
552
553	for (i = 0; i < num; i++) {
554		sh->dev[i].page = raid5_get_dev_page(sh, i);
555		sh->dev[i].orig_page = sh->dev[i].page;
556		sh->dev[i].offset = raid5_get_page_offset(sh, i);
557	}
558#endif
559	return 0;
560}
561
562static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
563			    struct stripe_head *sh);
564
565static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
566{
567	struct r5conf *conf = sh->raid_conf;
568	int i, seq;
569
570	BUG_ON(atomic_read(&sh->count) != 0);
571	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
572	BUG_ON(stripe_operations_active(sh));
573	BUG_ON(sh->batch_head);
574
575	pr_debug("init_stripe called, stripe %llu\n",
576		(unsigned long long)sector);
577retry:
578	seq = read_seqcount_begin(&conf->gen_lock);
579	sh->generation = conf->generation - previous;
580	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
581	sh->sector = sector;
582	stripe_set_idx(sector, conf, previous, sh);
583	sh->state = 0;
584
585	for (i = sh->disks; i--; ) {
586		struct r5dev *dev = &sh->dev[i];
587
588		if (dev->toread || dev->read || dev->towrite || dev->written ||
589		    test_bit(R5_LOCKED, &dev->flags)) {
590			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
591			       (unsigned long long)sh->sector, i, dev->toread,
592			       dev->read, dev->towrite, dev->written,
593			       test_bit(R5_LOCKED, &dev->flags));
594			WARN_ON(1);
595		}
596		dev->flags = 0;
597		dev->sector = raid5_compute_blocknr(sh, i, previous);
598	}
599	if (read_seqcount_retry(&conf->gen_lock, seq))
600		goto retry;
601	sh->overwrite_disks = 0;
602	insert_hash(conf, sh);
603	sh->cpu = smp_processor_id();
604	set_bit(STRIPE_BATCH_READY, &sh->state);
605}
606
607static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
608					 short generation)
609{
610	struct stripe_head *sh;
611
612	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
613	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
614		if (sh->sector == sector && sh->generation == generation)
615			return sh;
616	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
617	return NULL;
618}
619
620/*
621 * Need to check if array has failed when deciding whether to:
622 *  - start an array
623 *  - remove non-faulty devices
624 *  - add a spare
625 *  - allow a reshape
626 * This determination is simple when no reshape is happening.
627 * However if there is a reshape, we need to carefully check
628 * both the before and after sections.
629 * This is because some failed devices may only affect one
630 * of the two sections, and some non-in_sync devices may
631 * be in_sync in the section most affected by failed devices.
632 */
633int raid5_calc_degraded(struct r5conf *conf)
634{
635	int degraded, degraded2;
636	int i;
637
638	rcu_read_lock();
639	degraded = 0;
640	for (i = 0; i < conf->previous_raid_disks; i++) {
641		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
642		if (rdev && test_bit(Faulty, &rdev->flags))
643			rdev = rcu_dereference(conf->disks[i].replacement);
644		if (!rdev || test_bit(Faulty, &rdev->flags))
645			degraded++;
646		else if (test_bit(In_sync, &rdev->flags))
647			;
648		else
649			/* not in-sync or faulty.
650			 * If the reshape increases the number of devices,
651			 * this is being recovered by the reshape, so
652			 * this 'previous' section is not in_sync.
653			 * If the number of devices is being reduced however,
654			 * the device can only be part of the array if
655			 * we are reverting a reshape, so this section will
656			 * be in-sync.
657			 */
658			if (conf->raid_disks >= conf->previous_raid_disks)
659				degraded++;
660	}
661	rcu_read_unlock();
662	if (conf->raid_disks == conf->previous_raid_disks)
663		return degraded;
664	rcu_read_lock();
665	degraded2 = 0;
666	for (i = 0; i < conf->raid_disks; i++) {
667		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
668		if (rdev && test_bit(Faulty, &rdev->flags))
669			rdev = rcu_dereference(conf->disks[i].replacement);
670		if (!rdev || test_bit(Faulty, &rdev->flags))
671			degraded2++;
672		else if (test_bit(In_sync, &rdev->flags))
673			;
674		else
675			/* not in-sync or faulty.
676			 * If reshape increases the number of devices, this
677			 * section has already been recovered, else it
678			 * almost certainly hasn't.
679			 */
680			if (conf->raid_disks <= conf->previous_raid_disks)
681				degraded2++;
682	}
683	rcu_read_unlock();
684	if (degraded2 > degraded)
685		return degraded2;
686	return degraded;
687}
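
/*
 * Illustrative example: while growing a RAID5 array from 4 to 5 devices,
 * the newly added device is present but not yet In_sync.  The first loop
 * above only covers the 4 'previous' devices, and the second loop does not
 * count the new device because conf->raid_disks (5) > conf->previous_raid_disks
 * (4): the section already reshaped onto the new geometry has been recovered.
 * Such an array is therefore not reported as degraded.
 */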
688
689static bool has_failed(struct r5conf *conf)
690{
691	int degraded = conf->mddev->degraded;
692
693	if (test_bit(MD_BROKEN, &conf->mddev->flags))
694		return true;
695
696	if (conf->mddev->reshape_position != MaxSector)
697		degraded = raid5_calc_degraded(conf);
698
699	return degraded > conf->max_degraded;
700}
701
702struct stripe_head *
703raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
704			int previous, int noblock, int noquiesce)
705{
706	struct stripe_head *sh;
707	int hash = stripe_hash_locks_hash(conf, sector);
708	int inc_empty_inactive_list_flag;
709
710	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
711
712	spin_lock_irq(conf->hash_locks + hash);
713
714	do {
715		wait_event_lock_irq(conf->wait_for_quiescent,
716				    conf->quiesce == 0 || noquiesce,
717				    *(conf->hash_locks + hash));
718		sh = __find_stripe(conf, sector, conf->generation - previous);
719		if (!sh) {
720			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
721				sh = get_free_stripe(conf, hash);
722				if (!sh && !test_bit(R5_DID_ALLOC,
723						     &conf->cache_state))
724					set_bit(R5_ALLOC_MORE,
725						&conf->cache_state);
726			}
727			if (noblock && sh == NULL)
728				break;
729
730			r5c_check_stripe_cache_usage(conf);
731			if (!sh) {
732				set_bit(R5_INACTIVE_BLOCKED,
733					&conf->cache_state);
734				r5l_wake_reclaim(conf->log, 0);
735				wait_event_lock_irq(
736					conf->wait_for_stripe,
737					!list_empty(conf->inactive_list + hash) &&
738					(atomic_read(&conf->active_stripes)
739					 < (conf->max_nr_stripes * 3 / 4)
740					 || !test_bit(R5_INACTIVE_BLOCKED,
741						      &conf->cache_state)),
742					*(conf->hash_locks + hash));
743				clear_bit(R5_INACTIVE_BLOCKED,
744					  &conf->cache_state);
745			} else {
746				init_stripe(sh, sector, previous);
747				atomic_inc(&sh->count);
748			}
749		} else if (!atomic_inc_not_zero(&sh->count)) {
750			spin_lock(&conf->device_lock);
751			if (!atomic_read(&sh->count)) {
752				if (!test_bit(STRIPE_HANDLE, &sh->state))
753					atomic_inc(&conf->active_stripes);
754				BUG_ON(list_empty(&sh->lru) &&
755				       !test_bit(STRIPE_EXPANDING, &sh->state));
756				inc_empty_inactive_list_flag = 0;
757				if (!list_empty(conf->inactive_list + hash))
758					inc_empty_inactive_list_flag = 1;
759				list_del_init(&sh->lru);
760				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
761					atomic_inc(&conf->empty_inactive_list_nr);
762				if (sh->group) {
763					sh->group->stripes_cnt--;
764					sh->group = NULL;
765				}
766			}
767			atomic_inc(&sh->count);
768			spin_unlock(&conf->device_lock);
769		}
770	} while (sh == NULL);
771
772	spin_unlock_irq(conf->hash_locks + hash);
773	return sh;
774}
775
776static bool is_full_stripe_write(struct stripe_head *sh)
777{
778	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
779	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
780}
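
/*
 * e.g. for a 6-device RAID6 (max_degraded == 2), a stripe counts as a full
 * stripe write once all four data blocks have overwrite bios queued, i.e.
 * overwrite_disks == 4.
 */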
781
782static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
783		__acquires(&sh1->stripe_lock)
784		__acquires(&sh2->stripe_lock)
785{
786	if (sh1 > sh2) {
787		spin_lock_irq(&sh2->stripe_lock);
788		spin_lock_nested(&sh1->stripe_lock, 1);
789	} else {
790		spin_lock_irq(&sh1->stripe_lock);
791		spin_lock_nested(&sh2->stripe_lock, 1);
792	}
793}
794
795static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
796		__releases(&sh1->stripe_lock)
797		__releases(&sh2->stripe_lock)
798{
799	spin_unlock(&sh1->stripe_lock);
800	spin_unlock_irq(&sh2->stripe_lock);
801}
802
803/* Only a fresh full-stripe normal write can be added to a batch list */
804static bool stripe_can_batch(struct stripe_head *sh)
805{
806	struct r5conf *conf = sh->raid_conf;
807
808	if (raid5_has_log(conf) || raid5_has_ppl(conf))
809		return false;
810	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
811		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
812		is_full_stripe_write(sh);
813}
814
815/* we only do back search, i.e. look for the immediately preceding stripe */
816static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
817{
818	struct stripe_head *head;
819	sector_t head_sector, tmp_sec;
820	int hash;
821	int dd_idx;
822	int inc_empty_inactive_list_flag;
823
824	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
825	tmp_sec = sh->sector;
826	if (!sector_div(tmp_sec, conf->chunk_sectors))
827		return;
828	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
829
830	hash = stripe_hash_locks_hash(conf, head_sector);
831	spin_lock_irq(conf->hash_locks + hash);
832	head = __find_stripe(conf, head_sector, conf->generation);
833	if (head && !atomic_inc_not_zero(&head->count)) {
834		spin_lock(&conf->device_lock);
835		if (!atomic_read(&head->count)) {
836			if (!test_bit(STRIPE_HANDLE, &head->state))
837				atomic_inc(&conf->active_stripes);
838			BUG_ON(list_empty(&head->lru) &&
839			       !test_bit(STRIPE_EXPANDING, &head->state));
840			inc_empty_inactive_list_flag = 0;
841			if (!list_empty(conf->inactive_list + hash))
842				inc_empty_inactive_list_flag = 1;
843			list_del_init(&head->lru);
844			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
845				atomic_inc(&conf->empty_inactive_list_nr);
846			if (head->group) {
847				head->group->stripes_cnt--;
848				head->group = NULL;
849			}
850		}
851		atomic_inc(&head->count);
852		spin_unlock(&conf->device_lock);
853	}
854	spin_unlock_irq(conf->hash_locks + hash);
855
856	if (!head)
857		return;
858	if (!stripe_can_batch(head))
859		goto out;
860
861	lock_two_stripes(head, sh);
862	/* clear_batch_ready clears the flag */
863	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
864		goto unlock_out;
865
866	if (sh->batch_head)
867		goto unlock_out;
868
869	dd_idx = 0;
870	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
871		dd_idx++;
872	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
873	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
874		goto unlock_out;
875
876	if (head->batch_head) {
877		spin_lock(&head->batch_head->batch_lock);
878		/* This batch list is already running */
879		if (!stripe_can_batch(head)) {
880			spin_unlock(&head->batch_head->batch_lock);
881			goto unlock_out;
882		}
883		/*
884		 * We must assign this stripe's batch_head while holding the
885		 * batch_lock. Otherwise clear_batch_ready() of the batch head
886		 * stripe could clear this stripe's BATCH_READY bit before
887		 * stripe->batch_head is assigned, which would confuse
888		 * clear_batch_ready() for this stripe.
889		 */
890		sh->batch_head = head->batch_head;
891
892		/*
893		 * at this point, head's BATCH_READY could be cleared, but we
894		 * can still add the stripe to the batch list
895		 */
896		list_add(&sh->batch_list, &head->batch_list);
897		spin_unlock(&head->batch_head->batch_lock);
898	} else {
899		head->batch_head = head;
900		sh->batch_head = head->batch_head;
901		spin_lock(&head->batch_lock);
902		list_add_tail(&sh->batch_list, &head->batch_list);
903		spin_unlock(&head->batch_lock);
904	}
905
906	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
907		if (atomic_dec_return(&conf->preread_active_stripes)
908		    < IO_THRESHOLD)
909			md_wakeup_thread(conf->mddev->thread);
910
911	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
912		int seq = sh->bm_seq;
913		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
914		    sh->batch_head->bm_seq > seq)
915			seq = sh->batch_head->bm_seq;
916		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
917		sh->batch_head->bm_seq = seq;
918	}
919
920	atomic_inc(&sh->count);
921unlock_out:
922	unlock_two_stripes(head, sh);
923out:
924	raid5_release_stripe(head);
925}
926
927/* Determine if 'data_offset' or 'new_data_offset' should be used
928 * in this stripe_head.
929 */
930static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
931{
932	sector_t progress = conf->reshape_progress;
933	/* Need a memory barrier to make sure we see the values of
934	 * conf->generation and ->data_offset that were set before
935	 * reshape_progress was updated.
936	 */
937	smp_rmb();
938	if (progress == MaxSector)
939		return 0;
940	if (sh->generation == conf->generation - 1)
941		return 0;
942	/* We are in a reshape, and this is a new-generation stripe,
943	 * so use new_data_offset.
944	 */
945	return 1;
946}
947
948static void dispatch_bio_list(struct bio_list *tmp)
949{
950	struct bio *bio;
951
952	while ((bio = bio_list_pop(tmp)))
953		submit_bio_noacct(bio);
954}
955
956static int cmp_stripe(void *priv, const struct list_head *a,
957		      const struct list_head *b)
958{
959	const struct r5pending_data *da = list_entry(a,
960				struct r5pending_data, sibling);
961	const struct r5pending_data *db = list_entry(b,
962				struct r5pending_data, sibling);
963	if (da->sector > db->sector)
964		return 1;
965	if (da->sector < db->sector)
966		return -1;
967	return 0;
968}
969
970static void dispatch_defer_bios(struct r5conf *conf, int target,
971				struct bio_list *list)
972{
973	struct r5pending_data *data;
974	struct list_head *first, *next = NULL;
975	int cnt = 0;
976
977	if (conf->pending_data_cnt == 0)
978		return;
979
980	list_sort(NULL, &conf->pending_list, cmp_stripe);
981
982	first = conf->pending_list.next;
983
984	/* temporarily move the head */
985	if (conf->next_pending_data)
986		list_move_tail(&conf->pending_list,
987				&conf->next_pending_data->sibling);
988
989	while (!list_empty(&conf->pending_list)) {
990		data = list_first_entry(&conf->pending_list,
991			struct r5pending_data, sibling);
992		if (&data->sibling == first)
993			first = data->sibling.next;
994		next = data->sibling.next;
995
996		bio_list_merge(list, &data->bios);
997		list_move(&data->sibling, &conf->free_list);
998		cnt++;
999		if (cnt >= target)
1000			break;
1001	}
1002	conf->pending_data_cnt -= cnt;
1003	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1004
1005	if (next != &conf->pending_list)
1006		conf->next_pending_data = list_entry(next,
1007				struct r5pending_data, sibling);
1008	else
1009		conf->next_pending_data = NULL;
1010	/* list isn't empty */
1011	if (first != &conf->pending_list)
1012		list_move_tail(&conf->pending_list, first);
1013}
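
/*
 * Note on dispatch_defer_bios(): the list head itself is temporarily
 * relinked immediately before conf->next_pending_data, so that dispatching
 * resumes where the previous call stopped rather than always starting from
 * the lowest sector of the sorted list; afterwards the head is moved back
 * in front of the remaining first entry.
 */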
1014
1015static void flush_deferred_bios(struct r5conf *conf)
1016{
1017	struct bio_list tmp = BIO_EMPTY_LIST;
1018
1019	if (conf->pending_data_cnt == 0)
1020		return;
1021
1022	spin_lock(&conf->pending_bios_lock);
1023	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1024	BUG_ON(conf->pending_data_cnt != 0);
1025	spin_unlock(&conf->pending_bios_lock);
1026
1027	dispatch_bio_list(&tmp);
1028}
1029
1030static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1031				struct bio_list *bios)
1032{
1033	struct bio_list tmp = BIO_EMPTY_LIST;
1034	struct r5pending_data *ent;
1035
1036	spin_lock(&conf->pending_bios_lock);
1037	ent = list_first_entry(&conf->free_list, struct r5pending_data,
1038							sibling);
1039	list_move_tail(&ent->sibling, &conf->pending_list);
1040	ent->sector = sector;
1041	bio_list_init(&ent->bios);
1042	bio_list_merge(&ent->bios, bios);
1043	conf->pending_data_cnt++;
1044	if (conf->pending_data_cnt >= PENDING_IO_MAX)
1045		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1046
1047	spin_unlock(&conf->pending_bios_lock);
1048
1049	dispatch_bio_list(&tmp);
1050}
1051
1052static void
1053raid5_end_read_request(struct bio *bi);
1054static void
1055raid5_end_write_request(struct bio *bi);
1056
1057static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1058{
1059	struct r5conf *conf = sh->raid_conf;
1060	int i, disks = sh->disks;
1061	struct stripe_head *head_sh = sh;
1062	struct bio_list pending_bios = BIO_EMPTY_LIST;
1063	bool should_defer;
1064
1065	might_sleep();
1066
1067	if (log_stripe(sh, s) == 0)
1068		return;
1069
1070	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1071
1072	for (i = disks; i--; ) {
1073		int op, op_flags = 0;
1074		int replace_only = 0;
1075		struct bio *bi, *rbi;
1076		struct md_rdev *rdev, *rrdev = NULL;
1077
1078		sh = head_sh;
1079		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1080			op = REQ_OP_WRITE;
1081			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1082				op_flags = REQ_FUA;
1083			if (test_bit(R5_Discard, &sh->dev[i].flags))
1084				op = REQ_OP_DISCARD;
1085		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1086			op = REQ_OP_READ;
1087		else if (test_and_clear_bit(R5_WantReplace,
1088					    &sh->dev[i].flags)) {
1089			op = REQ_OP_WRITE;
1090			replace_only = 1;
1091		} else
1092			continue;
1093		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1094			op_flags |= REQ_SYNC;
1095
1096again:
1097		bi = &sh->dev[i].req;
1098		rbi = &sh->dev[i].rreq; /* For writing to replacement */
1099
1100		rcu_read_lock();
1101		rrdev = rcu_dereference(conf->disks[i].replacement);
1102		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
1103		rdev = rcu_dereference(conf->disks[i].rdev);
1104		if (!rdev) {
1105			rdev = rrdev;
1106			rrdev = NULL;
1107		}
1108		if (op_is_write(op)) {
1109			if (replace_only)
1110				rdev = NULL;
1111			if (rdev == rrdev)
1112				/* We raced and saw duplicates */
1113				rrdev = NULL;
1114		} else {
1115			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1116				rdev = rrdev;
1117			rrdev = NULL;
1118		}
1119
1120		if (rdev && test_bit(Faulty, &rdev->flags))
1121			rdev = NULL;
1122		if (rdev)
1123			atomic_inc(&rdev->nr_pending);
1124		if (rrdev && test_bit(Faulty, &rrdev->flags))
1125			rrdev = NULL;
1126		if (rrdev)
1127			atomic_inc(&rrdev->nr_pending);
1128		rcu_read_unlock();
1129
1130		/* We have already checked bad blocks for reads.  Now we
1131		 * need to check for writes.  We never accept write errors
1132		 * on the replacement, so we don't need to check rrdev.
1133		 */
1134		while (op_is_write(op) && rdev &&
1135		       test_bit(WriteErrorSeen, &rdev->flags)) {
1136			sector_t first_bad;
1137			int bad_sectors;
1138			int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1139					      &first_bad, &bad_sectors);
1140			if (!bad)
1141				break;
1142
1143			if (bad < 0) {
1144				set_bit(BlockedBadBlocks, &rdev->flags);
1145				if (!conf->mddev->external &&
1146				    conf->mddev->sb_flags) {
1147					/* It is very unlikely, but we might
1148					 * still need to write out the
1149					 * bad block log - better give it
1150					 * a chance */
1151					md_check_recovery(conf->mddev);
1152				}
1153				/*
1154				 * Because md_wait_for_blocked_rdev
1155				 * will dec nr_pending, we must
1156				 * increment it first.
1157				 */
1158				atomic_inc(&rdev->nr_pending);
1159				md_wait_for_blocked_rdev(rdev, conf->mddev);
1160			} else {
1161				/* Acknowledged bad block - skip the write */
1162				rdev_dec_pending(rdev, conf->mddev);
1163				rdev = NULL;
1164			}
1165		}
1166
1167		if (rdev) {
1168			if (s->syncing || s->expanding || s->expanded
1169			    || s->replacing)
1170				md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1171
1172			set_bit(STRIPE_IO_STARTED, &sh->state);
1173
1174			bio_set_dev(bi, rdev->bdev);
1175			bio_set_op_attrs(bi, op, op_flags);
1176			bi->bi_end_io = op_is_write(op)
1177				? raid5_end_write_request
1178				: raid5_end_read_request;
1179			bi->bi_private = sh;
1180
1181			pr_debug("%s: for %llu schedule op %d on disc %d\n",
1182				__func__, (unsigned long long)sh->sector,
1183				bi->bi_opf, i);
1184			atomic_inc(&sh->count);
1185			if (sh != head_sh)
1186				atomic_inc(&head_sh->count);
1187			if (use_new_offset(conf, sh))
1188				bi->bi_iter.bi_sector = (sh->sector
1189						 + rdev->new_data_offset);
1190			else
1191				bi->bi_iter.bi_sector = (sh->sector
1192						 + rdev->data_offset);
1193			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1194				bi->bi_opf |= REQ_NOMERGE;
1195
1196			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1197				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1198
1199			if (!op_is_write(op) &&
1200			    test_bit(R5_InJournal, &sh->dev[i].flags))
1201				/*
1202				 * issuing a read for a page in the journal;
1203				 * this must be preparing for a prexor in
1204				 * rmw, so read the data into orig_page
1205				 */
1206				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1207			else
1208				sh->dev[i].vec.bv_page = sh->dev[i].page;
1209			bi->bi_vcnt = 1;
1210			bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211			bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212			bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1213			bi->bi_write_hint = sh->dev[i].write_hint;
1214			if (!rrdev)
1215				sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
1216			/*
1217			 * If this is a discard request, set bi_vcnt to 0. We don't
1218			 * want to confuse SCSI because SCSI will replace the payload
1219			 */
1220			if (op == REQ_OP_DISCARD)
1221				bi->bi_vcnt = 0;
1222			if (rrdev)
1223				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1224
1225			if (conf->mddev->gendisk)
1226				trace_block_bio_remap(bi->bi_disk->queue,
1227						      bi, disk_devt(conf->mddev->gendisk),
1228						      sh->dev[i].sector);
1229			if (should_defer && op_is_write(op))
1230				bio_list_add(&pending_bios, bi);
1231			else
1232				submit_bio_noacct(bi);
1233		}
1234		if (rrdev) {
1235			if (s->syncing || s->expanding || s->expanded
1236			    || s->replacing)
1237				md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1238
1239			set_bit(STRIPE_IO_STARTED, &sh->state);
1240
1241			bio_set_dev(rbi, rrdev->bdev);
1242			bio_set_op_attrs(rbi, op, op_flags);
1243			BUG_ON(!op_is_write(op));
1244			rbi->bi_end_io = raid5_end_write_request;
1245			rbi->bi_private = sh;
1246
1247			pr_debug("%s: for %llu schedule op %d on "
1248				 "replacement disc %d\n",
1249				__func__, (unsigned long long)sh->sector,
1250				rbi->bi_opf, i);
1251			atomic_inc(&sh->count);
1252			if (sh != head_sh)
1253				atomic_inc(&head_sh->count);
1254			if (use_new_offset(conf, sh))
1255				rbi->bi_iter.bi_sector = (sh->sector
1256						  + rrdev->new_data_offset);
1257			else
1258				rbi->bi_iter.bi_sector = (sh->sector
1259						  + rrdev->data_offset);
1260			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1261				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1262			sh->dev[i].rvec.bv_page = sh->dev[i].page;
1263			rbi->bi_vcnt = 1;
1264			rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265			rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266			rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1267			rbi->bi_write_hint = sh->dev[i].write_hint;
1268			sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
1269			/*
1270			 * If this is a discard request, set bi_vcnt to 0. We don't
1271			 * want to confuse SCSI because SCSI will replace the payload
1272			 */
1273			if (op == REQ_OP_DISCARD)
1274				rbi->bi_vcnt = 0;
1275			if (conf->mddev->gendisk)
1276				trace_block_bio_remap(rbi->bi_disk->queue,
1277						      rbi, disk_devt(conf->mddev->gendisk),
1278						      sh->dev[i].sector);
1279			if (should_defer && op_is_write(op))
1280				bio_list_add(&pending_bios, rbi);
1281			else
1282				submit_bio_noacct(rbi);
1283		}
1284		if (!rdev && !rrdev) {
1285			if (op_is_write(op))
1286				set_bit(STRIPE_DEGRADED, &sh->state);
1287			pr_debug("skip op %d on disc %d for sector %llu\n",
1288				bi->bi_opf, i, (unsigned long long)sh->sector);
1289			clear_bit(R5_LOCKED, &sh->dev[i].flags);
1290			set_bit(STRIPE_HANDLE, &sh->state);
1291		}
1292
1293		if (!head_sh->batch_head)
1294			continue;
1295		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1296				      batch_list);
1297		if (sh != head_sh)
1298			goto again;
1299	}
1300
1301	if (should_defer && !bio_list_empty(&pending_bios))
1302		defer_issue_bios(conf, head_sh->sector, &pending_bios);
1303}
1304
1305static struct dma_async_tx_descriptor *
1306async_copy_data(int frombio, struct bio *bio, struct page **page,
1307	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1308	struct stripe_head *sh, int no_skipcopy)
1309{
1310	struct bio_vec bvl;
1311	struct bvec_iter iter;
1312	struct page *bio_page;
1313	int page_offset;
1314	struct async_submit_ctl submit;
1315	enum async_tx_flags flags = 0;
1316	struct r5conf *conf = sh->raid_conf;
1317
1318	if (bio->bi_iter.bi_sector >= sector)
1319		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1320	else
1321		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1322
1323	if (frombio)
1324		flags |= ASYNC_TX_FENCE;
1325	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1326
1327	bio_for_each_segment(bvl, bio, iter) {
1328		int len = bvl.bv_len;
1329		int clen;
1330		int b_offset = 0;
1331
1332		if (page_offset < 0) {
1333			b_offset = -page_offset;
1334			page_offset += b_offset;
1335			len -= b_offset;
1336		}
1337
1338		if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339			clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1340		else
1341			clen = len;
1342
1343		if (clen > 0) {
1344			b_offset += bvl.bv_offset;
1345			bio_page = bvl.bv_page;
1346			if (frombio) {
1347				if (conf->skip_copy &&
1348				    b_offset == 0 && page_offset == 0 &&
1349				    clen == RAID5_STRIPE_SIZE(conf) &&
1350				    !no_skipcopy)
1351					*page = bio_page;
1352				else
1353					tx = async_memcpy(*page, bio_page, page_offset + poff,
1354						  b_offset, clen, &submit);
1355			} else
1356				tx = async_memcpy(bio_page, *page, b_offset,
1357						  page_offset + poff, clen, &submit);
1358		}
1359		/* chain the operations */
1360		submit.depend_tx = tx;
1361
1362		if (clen < len) /* hit end of page */
1363			break;
1364		page_offset +=  len;
1365	}
1366
1367	return tx;
1368}
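
/*
 * e.g. a bio segment starting one sector past dev->sector is copied to byte
 * offset 512 within the stripe buffer (plus the page offset 'poff'); when
 * conf->skip_copy is enabled and a page-aligned frombio segment exactly
 * covers the whole stripe buffer, the bio page is referenced directly
 * instead of being copied.
 */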
1369
1370static void ops_complete_biofill(void *stripe_head_ref)
1371{
1372	struct stripe_head *sh = stripe_head_ref;
1373	int i;
1374	struct r5conf *conf = sh->raid_conf;
1375
1376	pr_debug("%s: stripe %llu\n", __func__,
1377		(unsigned long long)sh->sector);
1378
1379	/* clear completed biofills */
1380	for (i = sh->disks; i--; ) {
1381		struct r5dev *dev = &sh->dev[i];
1382
1383		/* acknowledge completion of a biofill operation and check
1384		 * if we need to reply to a read request; new
1385		 * R5_Wantfill requests are held off until
1386		 * !STRIPE_BIOFILL_RUN
1387		 */
1388		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1389			struct bio *rbi, *rbi2;
1390
1391			BUG_ON(!dev->read);
1392			rbi = dev->read;
1393			dev->read = NULL;
1394			while (rbi && rbi->bi_iter.bi_sector <
1395				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396				rbi2 = r5_next_bio(conf, rbi, dev->sector);
1397				bio_endio(rbi);
1398				rbi = rbi2;
1399			}
1400		}
1401	}
1402	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1403
1404	set_bit(STRIPE_HANDLE, &sh->state);
1405	raid5_release_stripe(sh);
1406}
1407
1408static void ops_run_biofill(struct stripe_head *sh)
1409{
1410	struct dma_async_tx_descriptor *tx = NULL;
1411	struct async_submit_ctl submit;
1412	int i;
1413	struct r5conf *conf = sh->raid_conf;
1414
1415	BUG_ON(sh->batch_head);
1416	pr_debug("%s: stripe %llu\n", __func__,
1417		(unsigned long long)sh->sector);
1418
1419	for (i = sh->disks; i--; ) {
1420		struct r5dev *dev = &sh->dev[i];
1421		if (test_bit(R5_Wantfill, &dev->flags)) {
1422			struct bio *rbi;
1423			spin_lock_irq(&sh->stripe_lock);
1424			dev->read = rbi = dev->toread;
1425			dev->toread = NULL;
1426			spin_unlock_irq(&sh->stripe_lock);
1427			while (rbi && rbi->bi_iter.bi_sector <
1428				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1429				tx = async_copy_data(0, rbi, &dev->page,
1430						     dev->offset,
1431						     dev->sector, tx, sh, 0);
1432				rbi = r5_next_bio(conf, rbi, dev->sector);
1433			}
1434		}
1435	}
1436
1437	atomic_inc(&sh->count);
1438	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1439	async_trigger_callback(&submit);
1440}
1441
1442static void mark_target_uptodate(struct stripe_head *sh, int target)
1443{
1444	struct r5dev *tgt;
1445
1446	if (target < 0)
1447		return;
1448
1449	tgt = &sh->dev[target];
1450	set_bit(R5_UPTODATE, &tgt->flags);
1451	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1452	clear_bit(R5_Wantcompute, &tgt->flags);
1453}
1454
1455static void ops_complete_compute(void *stripe_head_ref)
1456{
1457	struct stripe_head *sh = stripe_head_ref;
1458
1459	pr_debug("%s: stripe %llu\n", __func__,
1460		(unsigned long long)sh->sector);
1461
1462	/* mark the computed target(s) as uptodate */
1463	mark_target_uptodate(sh, sh->ops.target);
1464	mark_target_uptodate(sh, sh->ops.target2);
1465
1466	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1467	if (sh->check_state == check_state_compute_run)
1468		sh->check_state = check_state_compute_result;
1469	set_bit(STRIPE_HANDLE, &sh->state);
1470	raid5_release_stripe(sh);
1471}
1472
1473/* return a pointer to the (struct page *) list region of the scribble buffer */
1474static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1475{
1476	return percpu->scribble + i * percpu->scribble_obj_size;
1477}
1478
1479/* return a pointer to the address conversion region of the scribble buffer */
1480static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1481				 struct raid5_percpu *percpu, int i)
1482{
1483	return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1484}
1485
1486/*
1487 * Return a pointer to the array that records the per-device page offsets.
1488 */
1489static unsigned int *
1490to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1491{
1492	return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
1493}
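
/*
 * Layout of one scribble object, as implied by the three accessors above:
 * (disks + 2) struct page pointers, then (disks + 2) addr_conv_t entries,
 * then the array of per-device page offsets.
 */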
1494
1495static struct dma_async_tx_descriptor *
1496ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1497{
1498	int disks = sh->disks;
1499	struct page **xor_srcs = to_addr_page(percpu, 0);
1500	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1501	int target = sh->ops.target;
1502	struct r5dev *tgt = &sh->dev[target];
1503	struct page *xor_dest = tgt->page;
1504	unsigned int off_dest = tgt->offset;
1505	int count = 0;
1506	struct dma_async_tx_descriptor *tx;
1507	struct async_submit_ctl submit;
1508	int i;
1509
1510	BUG_ON(sh->batch_head);
1511
1512	pr_debug("%s: stripe %llu block: %d\n",
1513		__func__, (unsigned long long)sh->sector, target);
1514	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1515
1516	for (i = disks; i--; ) {
1517		if (i != target) {
1518			off_srcs[count] = sh->dev[i].offset;
1519			xor_srcs[count++] = sh->dev[i].page;
1520		}
1521	}
1522
1523	atomic_inc(&sh->count);
1524
1525	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1526			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1527	if (unlikely(count == 1))
1528		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1530	else
1531		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1533
1534	return tx;
1535}
1536
1537/* set_syndrome_sources - populate source buffers for gen_syndrome
1538 * @srcs - (struct page *) array of size sh->disks
1539 * @offs - (unsigned int) array of offset for each page
1540 * @sh - stripe_head to parse
1541 *
1542 * Populates srcs in proper layout order for the stripe and returns the
1543 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1544 * destination buffer is recorded in srcs[count] and the Q destination
1545 * is recorded in srcs[count+1].
1546 */
1547static int set_syndrome_sources(struct page **srcs,
1548				unsigned int *offs,
1549				struct stripe_head *sh,
1550				int srctype)
1551{
1552	int disks = sh->disks;
1553	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1554	int d0_idx = raid6_d0(sh);
1555	int count;
1556	int i;
1557
1558	for (i = 0; i < disks; i++)
1559		srcs[i] = NULL;
1560
1561	count = 0;
1562	i = d0_idx;
1563	do {
1564		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1565		struct r5dev *dev = &sh->dev[i];
1566
1567		if (i == sh->qd_idx || i == sh->pd_idx ||
1568		    (srctype == SYNDROME_SRC_ALL) ||
1569		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
1570		     (test_bit(R5_Wantdrain, &dev->flags) ||
1571		      test_bit(R5_InJournal, &dev->flags))) ||
1572		    (srctype == SYNDROME_SRC_WRITTEN &&
1573		     (dev->written ||
1574		      test_bit(R5_InJournal, &dev->flags)))) {
1575			if (test_bit(R5_InJournal, &dev->flags))
1576				srcs[slot] = sh->dev[i].orig_page;
1577			else
1578				srcs[slot] = sh->dev[i].page;
1579			/*
1580			 * For R5_InJournal, PAGE_SIZE must be 4KB and the
1581			 * page is not shared. In that case, dev[i].offset
1582			 * is 0.
1583			 */
1584			offs[slot] = sh->dev[i].offset;
1585		}
1586		i = raid6_next_disk(i, disks);
1587	} while (i != d0_idx);
1588
1589	return syndrome_disks;
1590}
1591
1592static struct dma_async_tx_descriptor *
1593ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1594{
1595	int disks = sh->disks;
1596	struct page **blocks = to_addr_page(percpu, 0);
1597	unsigned int *offs = to_addr_offs(sh, percpu);
1598	int target;
1599	int qd_idx = sh->qd_idx;
1600	struct dma_async_tx_descriptor *tx;
1601	struct async_submit_ctl submit;
1602	struct r5dev *tgt;
1603	struct page *dest;
1604	unsigned int dest_off;
1605	int i;
1606	int count;
1607
1608	BUG_ON(sh->batch_head);
1609	if (sh->ops.target < 0)
1610		target = sh->ops.target2;
1611	else if (sh->ops.target2 < 0)
1612		target = sh->ops.target;
1613	else
1614		/* we should only have one valid target */
1615		BUG();
1616	BUG_ON(target < 0);
1617	pr_debug("%s: stripe %llu block: %d\n",
1618		__func__, (unsigned long long)sh->sector, target);
1619
1620	tgt = &sh->dev[target];
1621	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1622	dest = tgt->page;
1623	dest_off = tgt->offset;
1624
1625	atomic_inc(&sh->count);
1626
1627	if (target == qd_idx) {
1628		count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1629		blocks[count] = NULL; /* regenerating p is not necessary */
1630		BUG_ON(blocks[count+1] != dest); /* q should already be set */
1631		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1632				  ops_complete_compute, sh,
1633				  to_addr_conv(sh, percpu, 0));
1634		tx = async_gen_syndrome(blocks, offs, count+2,
1635				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1636	} else {
1637		/* Compute any data- or p-drive using XOR */
1638		count = 0;
1639		for (i = disks; i-- ; ) {
1640			if (i == target || i == qd_idx)
1641				continue;
1642			offs[count] = sh->dev[i].offset;
1643			blocks[count++] = sh->dev[i].page;
1644		}
1645
1646		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1647				  NULL, ops_complete_compute, sh,
1648				  to_addr_conv(sh, percpu, 0));
1649		tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1651	}
1652
1653	return tx;
1654}
1655
1656static struct dma_async_tx_descriptor *
1657ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1658{
1659	int i, count, disks = sh->disks;
1660	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1661	int d0_idx = raid6_d0(sh);
1662	int faila = -1, failb = -1;
1663	int target = sh->ops.target;
1664	int target2 = sh->ops.target2;
1665	struct r5dev *tgt = &sh->dev[target];
1666	struct r5dev *tgt2 = &sh->dev[target2];
1667	struct dma_async_tx_descriptor *tx;
1668	struct page **blocks = to_addr_page(percpu, 0);
1669	unsigned int *offs = to_addr_offs(sh, percpu);
1670	struct async_submit_ctl submit;
1671
1672	BUG_ON(sh->batch_head);
1673	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1674		 __func__, (unsigned long long)sh->sector, target, target2);
1675	BUG_ON(target < 0 || target2 < 0);
1676	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1677	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1678
1679	/* we need to open-code set_syndrome_sources to handle the
1680	 * slot number conversion for 'faila' and 'failb'
1681	 */
1682	for (i = 0; i < disks ; i++) {
1683		offs[i] = 0;
1684		blocks[i] = NULL;
1685	}
1686	count = 0;
1687	i = d0_idx;
1688	do {
1689		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1690
1691		offs[slot] = sh->dev[i].offset;
1692		blocks[slot] = sh->dev[i].page;
1693
1694		if (i == target)
1695			faila = slot;
1696		if (i == target2)
1697			failb = slot;
1698		i = raid6_next_disk(i, disks);
1699	} while (i != d0_idx);
1700
1701	BUG_ON(faila == failb);
1702	if (failb < faila)
1703		swap(faila, failb);
1704	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1705		 __func__, (unsigned long long)sh->sector, faila, failb);
1706
1707	atomic_inc(&sh->count);
1708
1709	if (failb == syndrome_disks+1) {
1710		/* Q disk is one of the missing disks */
1711		if (faila == syndrome_disks) {
1712			/* Missing P+Q, just recompute */
1713			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1714					  ops_complete_compute, sh,
1715					  to_addr_conv(sh, percpu, 0));
1716			return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717						  RAID5_STRIPE_SIZE(sh->raid_conf),
1718						  &submit);
1719		} else {
1720			struct page *dest;
1721			unsigned int dest_off;
1722			int data_target;
1723			int qd_idx = sh->qd_idx;
1724
1725			/* Missing D+Q: recompute D from P, then recompute Q */
1726			if (target == qd_idx)
1727				data_target = target2;
1728			else
1729				data_target = target;
1730
1731			count = 0;
1732			for (i = disks; i-- ; ) {
1733				if (i == data_target || i == qd_idx)
1734					continue;
1735				offs[count] = sh->dev[i].offset;
1736				blocks[count++] = sh->dev[i].page;
1737			}
1738			dest = sh->dev[data_target].page;
1739			dest_off = sh->dev[data_target].offset;
1740			init_async_submit(&submit,
1741					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1742					  NULL, NULL, NULL,
1743					  to_addr_conv(sh, percpu, 0));
1744			tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745				       RAID5_STRIPE_SIZE(sh->raid_conf),
1746				       &submit);
1747
1748			count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1749			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1750					  ops_complete_compute, sh,
1751					  to_addr_conv(sh, percpu, 0));
1752			return async_gen_syndrome(blocks, offs, count+2,
1753						  RAID5_STRIPE_SIZE(sh->raid_conf),
1754						  &submit);
1755		}
1756	} else {
1757		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1758				  ops_complete_compute, sh,
1759				  to_addr_conv(sh, percpu, 0));
1760		if (failb == syndrome_disks) {
1761			/* We're missing D+P. */
1762			return async_raid6_datap_recov(syndrome_disks+2,
1763						RAID5_STRIPE_SIZE(sh->raid_conf),
1764						faila,
1765						blocks, offs, &submit);
1766		} else {
1767			/* We're missing D+D. */
1768			return async_raid6_2data_recov(syndrome_disks+2,
1769						RAID5_STRIPE_SIZE(sh->raid_conf),
1770						faila, failb,
1771						blocks, offs, &submit);
1772		}
1773	}
1774}
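
/*
 * Summary of the recovery cases handled above:
 *  - P and Q missing: regenerate both with async_gen_syndrome().
 *  - one data block and Q missing: rebuild the data block by XOR-ing the
 *    surviving data blocks with P, then regenerate Q with async_gen_syndrome().
 *  - one data block and P missing: async_raid6_datap_recov().
 *  - two data blocks missing: async_raid6_2data_recov().
 */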
1775
1776static void ops_complete_prexor(void *stripe_head_ref)
1777{
1778	struct stripe_head *sh = stripe_head_ref;
1779
1780	pr_debug("%s: stripe %llu\n", __func__,
1781		(unsigned long long)sh->sector);
1782
1783	if (r5c_is_writeback(sh->raid_conf->log))
1784		/*
1785		 * raid5-cache write back uses orig_page during prexor.
1786		 * After prexor, it is time to free orig_page
1787		 */
1788		r5c_release_extra_page(sh);
1789}
1790
1791static struct dma_async_tx_descriptor *
1792ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1793		struct dma_async_tx_descriptor *tx)
1794{
1795	int disks = sh->disks;
1796	struct page **xor_srcs = to_addr_page(percpu, 0);
1797	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1798	int count = 0, pd_idx = sh->pd_idx, i;
1799	struct async_submit_ctl submit;
1800
1801	/* existing parity data subtracted */
1802	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1803	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1804
1805	BUG_ON(sh->batch_head);
1806	pr_debug("%s: stripe %llu\n", __func__,
1807		(unsigned long long)sh->sector);
1808
1809	for (i = disks; i--; ) {
1810		struct r5dev *dev = &sh->dev[i];
1811		/* Only process blocks that are known to be uptodate */
1812		if (test_bit(R5_InJournal, &dev->flags)) {
1813			/*
1814			 * For this case, PAGE_SIZE must be equal to 4KB and
1815			 * page offset is zero.
1816			 */
1817			off_srcs[count] = dev->offset;
1818			xor_srcs[count++] = dev->orig_page;
1819		} else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820			off_srcs[count] = dev->offset;
1821			xor_srcs[count++] = dev->page;
1822		}
1823	}
1824
1825	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1826			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1827	tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1829
1830	return tx;
1831}
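
/*
 * The prexor above is the first half of a read-modify-write parity update:
 * P' = P ^ D_old for every block being drained.  ops_run_reconstruct5()
 * later folds in the new data, giving P_new = P' ^ D_new.  A one-byte
 * sketch with made-up values:
 *
 *   P_old = 0x5a, D_old = 0x33, D_new = 0x0f
 *   prexor:      P'    = 0x5a ^ 0x33 = 0x69
 *   reconstruct: P_new = 0x69 ^ 0x0f = 0x66
 *
 * which matches recomputing P_old ^ D_old ^ D_new directly.
 */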
1832
1833static struct dma_async_tx_descriptor *
1834ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1835		struct dma_async_tx_descriptor *tx)
1836{
1837	struct page **blocks = to_addr_page(percpu, 0);
1838	unsigned int *offs = to_addr_offs(sh, percpu);
1839	int count;
1840	struct async_submit_ctl submit;
1841
1842	pr_debug("%s: stripe %llu\n", __func__,
1843		(unsigned long long)sh->sector);
1844
1845	count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1846
1847	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1848			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1849	tx = async_gen_syndrome(blocks, offs, count+2,
1850			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1851
1852	return tx;
1853}
1854
1855static struct dma_async_tx_descriptor *
1856ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1857{
1858	struct r5conf *conf = sh->raid_conf;
1859	int disks = sh->disks;
1860	int i;
1861	struct stripe_head *head_sh = sh;
1862
1863	pr_debug("%s: stripe %llu\n", __func__,
1864		(unsigned long long)sh->sector);
1865
1866	for (i = disks; i--; ) {
1867		struct r5dev *dev;
1868		struct bio *chosen;
1869
1870		sh = head_sh;
1871		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1872			struct bio *wbi;
1873
1874again:
1875			dev = &sh->dev[i];
1876			/*
1877			 * clear R5_InJournal, so when rewriting a page in
1878			 * journal, it is not skipped by r5l_log_stripe()
1879			 */
1880			clear_bit(R5_InJournal, &dev->flags);
1881			spin_lock_irq(&sh->stripe_lock);
1882			chosen = dev->towrite;
1883			dev->towrite = NULL;
1884			sh->overwrite_disks = 0;
1885			BUG_ON(dev->written);
1886			wbi = dev->written = chosen;
1887			spin_unlock_irq(&sh->stripe_lock);
1888			WARN_ON(dev->page != dev->orig_page);
1889
1890			while (wbi && wbi->bi_iter.bi_sector <
1891				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1892				if (wbi->bi_opf & REQ_FUA)
1893					set_bit(R5_WantFUA, &dev->flags);
1894				if (wbi->bi_opf & REQ_SYNC)
1895					set_bit(R5_SyncIO, &dev->flags);
1896				if (bio_op(wbi) == REQ_OP_DISCARD)
1897					set_bit(R5_Discard, &dev->flags);
1898				else {
1899					tx = async_copy_data(1, wbi, &dev->page,
1900							     dev->offset,
1901							     dev->sector, tx, sh,
1902							     r5c_is_writeback(conf->log));
1903					if (dev->page != dev->orig_page &&
1904					    !r5c_is_writeback(conf->log)) {
1905						set_bit(R5_SkipCopy, &dev->flags);
1906						clear_bit(R5_UPTODATE, &dev->flags);
1907						clear_bit(R5_OVERWRITE, &dev->flags);
1908					}
1909				}
1910				wbi = r5_next_bio(conf, wbi, dev->sector);
1911			}
1912
1913			if (head_sh->batch_head) {
1914				sh = list_first_entry(&sh->batch_list,
1915						      struct stripe_head,
1916						      batch_list);
1917				if (sh == head_sh)
1918					continue;
1919				goto again;
1920			}
1921		}
1922	}
1923
1924	return tx;
1925}
1926
1927static void ops_complete_reconstruct(void *stripe_head_ref)
1928{
1929	struct stripe_head *sh = stripe_head_ref;
1930	int disks = sh->disks;
1931	int pd_idx = sh->pd_idx;
1932	int qd_idx = sh->qd_idx;
1933	int i;
1934	bool fua = false, sync = false, discard = false;
1935
1936	pr_debug("%s: stripe %llu\n", __func__,
1937		(unsigned long long)sh->sector);
1938
1939	for (i = disks; i--; ) {
1940		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1941		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1942		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1943	}
1944
1945	for (i = disks; i--; ) {
1946		struct r5dev *dev = &sh->dev[i];
1947
1948		if (dev->written || i == pd_idx || i == qd_idx) {
1949			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1950				set_bit(R5_UPTODATE, &dev->flags);
1951				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1952					set_bit(R5_Expanded, &dev->flags);
1953			}
1954			if (fua)
1955				set_bit(R5_WantFUA, &dev->flags);
1956			if (sync)
1957				set_bit(R5_SyncIO, &dev->flags);
1958		}
1959	}
1960
1961	if (sh->reconstruct_state == reconstruct_state_drain_run)
1962		sh->reconstruct_state = reconstruct_state_drain_result;
1963	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1964		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1965	else {
1966		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1967		sh->reconstruct_state = reconstruct_state_result;
1968	}
1969
1970	set_bit(STRIPE_HANDLE, &sh->state);
1971	raid5_release_stripe(sh);
1972}
1973
1974static void
1975ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1976		     struct dma_async_tx_descriptor *tx)
1977{
1978	int disks = sh->disks;
1979	struct page **xor_srcs;
1980	unsigned int *off_srcs;
1981	struct async_submit_ctl submit;
1982	int count, pd_idx = sh->pd_idx, i;
1983	struct page *xor_dest;
1984	unsigned int off_dest;
1985	int prexor = 0;
1986	unsigned long flags;
1987	int j = 0;
1988	struct stripe_head *head_sh = sh;
1989	int last_stripe;
1990
1991	pr_debug("%s: stripe %llu\n", __func__,
1992		(unsigned long long)sh->sector);
1993
1994	for (i = 0; i < sh->disks; i++) {
1995		if (pd_idx == i)
1996			continue;
1997		if (!test_bit(R5_Discard, &sh->dev[i].flags))
1998			break;
1999	}
2000	if (i >= sh->disks) {
2001		atomic_inc(&sh->count);
2002		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2003		ops_complete_reconstruct(sh);
2004		return;
2005	}
2006again:
2007	count = 0;
2008	xor_srcs = to_addr_page(percpu, j);
2009	off_srcs = to_addr_offs(sh, percpu);
2010	/* check if prexor is active which means only process blocks
2011	 * that are part of a read-modify-write (written)
2012	 */
2013	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2014		prexor = 1;
2015		off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2016		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2017		for (i = disks; i--; ) {
2018			struct r5dev *dev = &sh->dev[i];
2019			if (head_sh->dev[i].written ||
2020			    test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021				off_srcs[count] = dev->offset;
2022				xor_srcs[count++] = dev->page;
2023			}
2024		}
2025	} else {
2026		xor_dest = sh->dev[pd_idx].page;
2027		off_dest = sh->dev[pd_idx].offset;
2028		for (i = disks; i--; ) {
2029			struct r5dev *dev = &sh->dev[i];
2030			if (i != pd_idx) {
2031				off_srcs[count] = dev->offset;
2032				xor_srcs[count++] = dev->page;
2033			}
2034		}
2035	}
2036
2037	/* 1/ if we prexor'd then the dest is reused as a source
2038	 * 2/ if we did not prexor then we are redoing the parity
2039	 * set ASYNC_TX_XOR_DROP_DST or ASYNC_TX_XOR_ZERO_DST, respectively,
2040	 * for the synchronous xor case
2041	 */
2042	last_stripe = !head_sh->batch_head ||
2043		list_first_entry(&sh->batch_list,
2044				 struct stripe_head, batch_list) == head_sh;
2045	if (last_stripe) {
2046		flags = ASYNC_TX_ACK |
2047			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2048
2049		atomic_inc(&head_sh->count);
2050		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2051				  to_addr_conv(sh, percpu, j));
2052	} else {
2053		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2054		init_async_submit(&submit, flags, tx, NULL, NULL,
2055				  to_addr_conv(sh, percpu, j));
2056	}
2057
2058	if (unlikely(count == 1))
2059		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2061	else
2062		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2064	if (!last_stripe) {
2065		j++;
2066		sh = list_first_entry(&sh->batch_list, struct stripe_head,
2067				      batch_list);
2068		goto again;
2069	}
2070}
2071
2072static void
2073ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2074		     struct dma_async_tx_descriptor *tx)
2075{
2076	struct async_submit_ctl submit;
2077	struct page **blocks;
2078	unsigned int *offs;
2079	int count, i, j = 0;
2080	struct stripe_head *head_sh = sh;
2081	int last_stripe;
2082	int synflags;
2083	unsigned long txflags;
2084
2085	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2086
2087	for (i = 0; i < sh->disks; i++) {
2088		if (sh->pd_idx == i || sh->qd_idx == i)
2089			continue;
2090		if (!test_bit(R5_Discard, &sh->dev[i].flags))
2091			break;
2092	}
2093	if (i >= sh->disks) {
2094		atomic_inc(&sh->count);
2095		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2096		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2097		ops_complete_reconstruct(sh);
2098		return;
2099	}
2100
2101again:
2102	blocks = to_addr_page(percpu, j);
2103	offs = to_addr_offs(sh, percpu);
2104
2105	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2106		synflags = SYNDROME_SRC_WRITTEN;
2107		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2108	} else {
2109		synflags = SYNDROME_SRC_ALL;
2110		txflags = ASYNC_TX_ACK;
2111	}
2112
2113	count = set_syndrome_sources(blocks, offs, sh, synflags);
2114	last_stripe = !head_sh->batch_head ||
2115		list_first_entry(&sh->batch_list,
2116				 struct stripe_head, batch_list) == head_sh;
2117
2118	if (last_stripe) {
2119		atomic_inc(&head_sh->count);
2120		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2121				  head_sh, to_addr_conv(sh, percpu, j));
2122	} else
2123		init_async_submit(&submit, 0, tx, NULL, NULL,
2124				  to_addr_conv(sh, percpu, j));
2125	tx = async_gen_syndrome(blocks, offs, count+2,
2126			RAID5_STRIPE_SIZE(sh->raid_conf),  &submit);
2127	if (!last_stripe) {
2128		j++;
2129		sh = list_first_entry(&sh->batch_list, struct stripe_head,
2130				      batch_list);
2131		goto again;
2132	}
2133}
2134
2135static void ops_complete_check(void *stripe_head_ref)
2136{
2137	struct stripe_head *sh = stripe_head_ref;
2138
2139	pr_debug("%s: stripe %llu\n", __func__,
2140		(unsigned long long)sh->sector);
2141
2142	sh->check_state = check_state_check_result;
2143	set_bit(STRIPE_HANDLE, &sh->state);
2144	raid5_release_stripe(sh);
2145}
2146
2147static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2148{
2149	int disks = sh->disks;
2150	int pd_idx = sh->pd_idx;
2151	int qd_idx = sh->qd_idx;
2152	struct page *xor_dest;
2153	unsigned int off_dest;
2154	struct page **xor_srcs = to_addr_page(percpu, 0);
2155	unsigned int *off_srcs = to_addr_offs(sh, percpu);
2156	struct dma_async_tx_descriptor *tx;
2157	struct async_submit_ctl submit;
2158	int count;
2159	int i;
2160
2161	pr_debug("%s: stripe %llu\n", __func__,
2162		(unsigned long long)sh->sector);
2163
2164	BUG_ON(sh->batch_head);
2165	count = 0;
2166	xor_dest = sh->dev[pd_idx].page;
2167	off_dest = sh->dev[pd_idx].offset;
2168	off_srcs[count] = off_dest;
2169	xor_srcs[count++] = xor_dest;
2170	for (i = disks; i--; ) {
2171		if (i == pd_idx || i == qd_idx)
2172			continue;
2173		off_srcs[count] = sh->dev[i].offset;
2174		xor_srcs[count++] = sh->dev[i].page;
2175	}
2176
2177	init_async_submit(&submit, 0, NULL, NULL, NULL,
2178			  to_addr_conv(sh, percpu, 0));
2179	tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180			   RAID5_STRIPE_SIZE(sh->raid_conf),
2181			   &sh->ops.zero_sum_result, &submit);
2182
2183	atomic_inc(&sh->count);
2184	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2185	tx = async_trigger_callback(&submit);
2186}
2187
2188static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2189{
2190	struct page **srcs = to_addr_page(percpu, 0);
2191	unsigned int *offs = to_addr_offs(sh, percpu);
2192	struct async_submit_ctl submit;
2193	int count;
2194
2195	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2196		(unsigned long long)sh->sector, checkp);
2197
2198	BUG_ON(sh->batch_head);
2199	count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2200	if (!checkp)
2201		srcs[count] = NULL;
2202
2203	atomic_inc(&sh->count);
2204	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2205			  sh, to_addr_conv(sh, percpu, 0));
2206	async_syndrome_val(srcs, offs, count+2,
2207			   RAID5_STRIPE_SIZE(sh->raid_conf),
2208			   &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2209}
2210
2211static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2212{
2213	int overlap_clear = 0, i, disks = sh->disks;
2214	struct dma_async_tx_descriptor *tx = NULL;
2215	struct r5conf *conf = sh->raid_conf;
2216	int level = conf->level;
2217	struct raid5_percpu *percpu;
2218	unsigned long cpu;
2219
2220	cpu = get_cpu();
2221	percpu = per_cpu_ptr(conf->percpu, cpu);
2222	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2223		ops_run_biofill(sh);
2224		overlap_clear++;
2225	}
2226
2227	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2228		if (level < 6)
2229			tx = ops_run_compute5(sh, percpu);
2230		else {
2231			if (sh->ops.target2 < 0 || sh->ops.target < 0)
2232				tx = ops_run_compute6_1(sh, percpu);
2233			else
2234				tx = ops_run_compute6_2(sh, percpu);
2235		}
2236		/* terminate the chain if reconstruct is not set to be run */
2237		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2238			async_tx_ack(tx);
2239	}
2240
2241	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2242		if (level < 6)
2243			tx = ops_run_prexor5(sh, percpu, tx);
2244		else
2245			tx = ops_run_prexor6(sh, percpu, tx);
2246	}
2247
2248	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2249		tx = ops_run_partial_parity(sh, percpu, tx);
2250
2251	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2252		tx = ops_run_biodrain(sh, tx);
2253		overlap_clear++;
2254	}
2255
2256	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2257		if (level < 6)
2258			ops_run_reconstruct5(sh, percpu, tx);
2259		else
2260			ops_run_reconstruct6(sh, percpu, tx);
2261	}
2262
2263	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2264		if (sh->check_state == check_state_run)
2265			ops_run_check_p(sh, percpu);
2266		else if (sh->check_state == check_state_run_q)
2267			ops_run_check_pq(sh, percpu, 0);
2268		else if (sh->check_state == check_state_run_pq)
2269			ops_run_check_pq(sh, percpu, 1);
2270		else
2271			BUG();
2272	}
2273
2274	if (overlap_clear && !sh->batch_head)
2275		for (i = disks; i--; ) {
2276			struct r5dev *dev = &sh->dev[i];
2277			if (test_and_clear_bit(R5_Overlap, &dev->flags))
2278				wake_up(&sh->raid_conf->wait_for_overlap);
2279		}
2280	put_cpu();
2281}
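
/*
 * The tests above also fix the order in which a stripe's operations get
 * chained through 'tx' in a single pass.  Two common sketches (illustrative;
 * the flag combinations come from schedule_reconstruction() later in this
 * file):
 *
 *   read-modify-write:  STRIPE_OP_PREXOR -> STRIPE_OP_BIODRAIN ->
 *                       STRIPE_OP_RECONSTRUCT
 *   reconstruct-write:  STRIPE_OP_BIODRAIN -> STRIPE_OP_RECONSTRUCT
 *
 * A lone STRIPE_OP_COMPUTE_BLK (e.g. a degraded read) has its chain acked
 * immediately, since no reconstruct will consume it.
 */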
2282
2283static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2284{
2285#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2286	kfree(sh->pages);
2287#endif
2288	if (sh->ppl_page)
2289		__free_page(sh->ppl_page);
2290	kmem_cache_free(sc, sh);
2291}
2292
2293static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2294	int disks, struct r5conf *conf)
2295{
2296	struct stripe_head *sh;
2297	int i;
2298
2299	sh = kmem_cache_zalloc(sc, gfp);
2300	if (sh) {
2301		spin_lock_init(&sh->stripe_lock);
2302		spin_lock_init(&sh->batch_lock);
2303		INIT_LIST_HEAD(&sh->batch_list);
2304		INIT_LIST_HEAD(&sh->lru);
2305		INIT_LIST_HEAD(&sh->r5c);
2306		INIT_LIST_HEAD(&sh->log_list);
2307		atomic_set(&sh->count, 1);
2308		sh->raid_conf = conf;
2309		sh->log_start = MaxSector;
2310		for (i = 0; i < disks; i++) {
2311			struct r5dev *dev = &sh->dev[i];
2312
2313			bio_init(&dev->req, &dev->vec, 1);
2314			bio_init(&dev->rreq, &dev->rvec, 1);
2315		}
2316
2317		if (raid5_has_ppl(conf)) {
2318			sh->ppl_page = alloc_page(gfp);
2319			if (!sh->ppl_page) {
2320				free_stripe(sc, sh);
2321				return NULL;
2322			}
2323		}
2324#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2325		if (init_stripe_shared_pages(sh, conf, disks)) {
2326			free_stripe(sc, sh);
2327			return NULL;
2328		}
2329#endif
2330	}
2331	return sh;
2332}
2333static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2334{
2335	struct stripe_head *sh;
2336
2337	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2338	if (!sh)
2339		return 0;
2340
2341	if (grow_buffers(sh, gfp)) {
2342		shrink_buffers(sh);
2343		free_stripe(conf->slab_cache, sh);
2344		return 0;
2345	}
2346	sh->hash_lock_index =
2347		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2348	/* we just created an active stripe so... */
2349	atomic_inc(&conf->active_stripes);
2350
2351	raid5_release_stripe(sh);
2352	conf->max_nr_stripes++;
2353	return 1;
2354}
2355
2356static int grow_stripes(struct r5conf *conf, int num)
2357{
2358	struct kmem_cache *sc;
2359	size_t namelen = sizeof(conf->cache_name[0]);
2360	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2361
2362	if (conf->mddev->gendisk)
2363		snprintf(conf->cache_name[0], namelen,
2364			"raid%d-%s", conf->level, mdname(conf->mddev));
2365	else
2366		snprintf(conf->cache_name[0], namelen,
2367			"raid%d-%p", conf->level, conf->mddev);
2368	snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2369
2370	conf->active_name = 0;
2371	sc = kmem_cache_create(conf->cache_name[conf->active_name],
2372			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2373			       0, 0, NULL);
2374	if (!sc)
2375		return 1;
2376	conf->slab_cache = sc;
2377	conf->pool_size = devs;
2378	while (num--)
2379		if (!grow_one_stripe(conf, GFP_KERNEL))
2380			return 1;
2381
2382	return 0;
2383}
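
/*
 * Sizing note for the kmem_cache above: struct stripe_head ends in a
 * one-element dev[] array (see raid5.h), so each object carries (devs - 1)
 * extra struct r5dev slots and sh->dev[0..devs-1] stays valid.  With
 * made-up sizes purely for illustration (stripe_head 400 bytes, r5dev
 * 300 bytes), a 6-device array would need 400 + 5 * 300 = 1900 bytes per
 * stripe_head object.
 */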
2384
2385/**
2386 * scribble_alloc - allocate percpu scribble buffer for required size
2387 *		    of the scribble region
2388 * @percpu: from for_each_present_cpu() of the caller
2389 * @num: total number of disks in the array
2390 * @cnt: scribble objs count for required size of the scribble region
2391 *
2392 * The scribble buffer size must be enough to contain:
2393 * 1/ a struct page pointer for each device in the array +2
2394 * 2/ room to convert each entry in (1) to its corresponding dma
2395 *    (dma_map_page()) or page (page_address()) address.
2396 *
2397 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2398 * calculate over all devices (not just the data blocks), using zeros in place
2399 * of the P and Q blocks.
2400 */
2401static int scribble_alloc(struct raid5_percpu *percpu,
2402			  int num, int cnt)
2403{
2404	size_t obj_size =
2405		sizeof(struct page *) * (num + 2) +
2406		sizeof(addr_conv_t) * (num + 2) +
2407		sizeof(unsigned int) * (num + 2);
2408	void *scribble;
2409
2410	/*
2411	 * If this is called in the raid array suspend context, it is also in
2412	 * memalloc noio context, so there is no risk of recursive memory
2413	 * reclaim I/O even with the GFP_KERNEL flag.
2414	 */
2415	scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2416	if (!scribble)
2417		return -ENOMEM;
2418
2419	kvfree(percpu->scribble);
2420
2421	percpu->scribble = scribble;
2422	percpu->scribble_obj_size = obj_size;
2423	return 0;
2424}
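
/*
 * Sizing sketch for the scribble region (assumed 64-bit sizes: 8 bytes for
 * a struct page pointer, 8 for addr_conv_t, 4 for unsigned int): an
 * 8-device array gives obj_size = (8 + 2) * (8 + 8 + 4) = 200 bytes, and
 * with cnt = 128 (e.g. 1024-sector chunks over 8-sector stripes) each CPU
 * ends up with 128 * 200 = 25600 bytes from kvmalloc_array().
 */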
2425
2426static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2427{
2428	unsigned long cpu;
2429	int err = 0;
2430
2431	/*
2432	 * Never shrink. And mddev_suspend() could deadlock if this is called
2433	 * from raid5d. In that case, scribble_disks and scribble_sectors
2434	 * should already equal new_disks and new_sectors.
2435	 */
2436	if (conf->scribble_disks >= new_disks &&
2437	    conf->scribble_sectors >= new_sectors)
2438		return 0;
2439	mddev_suspend(conf->mddev);
2440	get_online_cpus();
2441
2442	for_each_present_cpu(cpu) {
2443		struct raid5_percpu *percpu;
2444
2445		percpu = per_cpu_ptr(conf->percpu, cpu);
2446		err = scribble_alloc(percpu, new_disks,
2447				     new_sectors / RAID5_STRIPE_SECTORS(conf));
2448		if (err)
2449			break;
2450	}
2451
2452	put_online_cpus();
2453	mddev_resume(conf->mddev);
2454	if (!err) {
2455		conf->scribble_disks = new_disks;
2456		conf->scribble_sectors = new_sectors;
2457	}
2458	return err;
2459}
2460
2461static int resize_stripes(struct r5conf *conf, int newsize)
2462{
2463	/* Make all the stripes able to hold 'newsize' devices.
2464	 * New slots in each stripe get 'page' set to a new page.
2465	 *
2466	 * This happens in stages:
2467	 * 1/ create a new kmem_cache and allocate the required number of
2468	 *    stripe_heads.
2469	 * 2/ gather all the old stripe_heads and transfer the pages across
2470	 *    to the new stripe_heads.  This will have the side effect of
2471	 *    freezing the array as once all stripe_heads have been collected,
2472	 *    no IO will be possible.  Old stripe heads are freed once their
2473	 *    pages have been transferred over, and the old kmem_cache is
2474	 *    freed when all stripes are done.
2475	 * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
2476	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
2477	 *    we simply return a failure status - no need to clean anything up.
2478	 * 4/ allocate new pages for the new slots in the new stripe_heads.
2479	 *    If this fails, we don't bother trying to shrink the
2480	 *    As each stripe_head is processed the new one is released into
2481	 *    active service.
2482	 *
2483	 * Once step2 is started, we cannot afford to wait for a write,
2484	 * so we use GFP_NOIO allocations.
2485	 */
2486	struct stripe_head *osh, *nsh;
2487	LIST_HEAD(newstripes);
2488	struct disk_info *ndisks;
2489	int err = 0;
2490	struct kmem_cache *sc;
2491	int i;
2492	int hash, cnt;
2493
2494	md_allow_write(conf->mddev);
2495
2496	/* Step 1 */
2497	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2498			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2499			       0, 0, NULL);
2500	if (!sc)
2501		return -ENOMEM;
2502
2503	/* Need to ensure auto-resizing doesn't interfere */
2504	mutex_lock(&conf->cache_size_mutex);
2505
2506	for (i = conf->max_nr_stripes; i; i--) {
2507		nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2508		if (!nsh)
2509			break;
2510
2511		list_add(&nsh->lru, &newstripes);
2512	}
2513	if (i) {
2514		/* didn't get enough, give up */
2515		while (!list_empty(&newstripes)) {
2516			nsh = list_entry(newstripes.next, struct stripe_head, lru);
2517			list_del(&nsh->lru);
2518			free_stripe(sc, nsh);
2519		}
2520		kmem_cache_destroy(sc);
2521		mutex_unlock(&conf->cache_size_mutex);
2522		return -ENOMEM;
2523	}
2524	/* Step 2 - Must use GFP_NOIO now.
2525	 * OK, we have enough stripes, start collecting inactive
2526	 * stripes and copying them over
2527	 */
2528	hash = 0;
2529	cnt = 0;
2530	list_for_each_entry(nsh, &newstripes, lru) {
2531		lock_device_hash_lock(conf, hash);
2532		wait_event_cmd(conf->wait_for_stripe,
2533				    !list_empty(conf->inactive_list + hash),
2534				    unlock_device_hash_lock(conf, hash),
2535				    lock_device_hash_lock(conf, hash));
2536		osh = get_free_stripe(conf, hash);
2537		unlock_device_hash_lock(conf, hash);
2538
2539#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2540		for (i = 0; i < osh->nr_pages; i++) {
2541			nsh->pages[i] = osh->pages[i];
2542			osh->pages[i] = NULL;
2543		}
2544#endif
2545		for (i = 0; i < conf->pool_size; i++) {
2546			nsh->dev[i].page = osh->dev[i].page;
2547			nsh->dev[i].orig_page = osh->dev[i].page;
2548			nsh->dev[i].offset = osh->dev[i].offset;
2549		}
2550		nsh->hash_lock_index = hash;
2551		free_stripe(conf->slab_cache, osh);
2552		cnt++;
2553		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2554		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2555			hash++;
2556			cnt = 0;
2557		}
2558	}
2559	kmem_cache_destroy(conf->slab_cache);
2560
2561	/* Step 3.
2562	 * At this point, we are holding all the stripes so the array
2563	 * is completely stalled, so now is a good time to resize
2564	 * conf->disks and the scribble region
2565	 */
2566	ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2567	if (ndisks) {
2568		for (i = 0; i < conf->pool_size; i++)
2569			ndisks[i] = conf->disks[i];
2570
2571		for (i = conf->pool_size; i < newsize; i++) {
2572			ndisks[i].extra_page = alloc_page(GFP_NOIO);
2573			if (!ndisks[i].extra_page)
2574				err = -ENOMEM;
2575		}
2576
2577		if (err) {
2578			for (i = conf->pool_size; i < newsize; i++)
2579				if (ndisks[i].extra_page)
2580					put_page(ndisks[i].extra_page);
2581			kfree(ndisks);
2582		} else {
2583			kfree(conf->disks);
2584			conf->disks = ndisks;
2585		}
2586	} else
2587		err = -ENOMEM;
2588
2589	conf->slab_cache = sc;
2590	conf->active_name = 1-conf->active_name;
2591
2592	/* Step 4, return new stripes to service */
2593	while(!list_empty(&newstripes)) {
2594		nsh = list_entry(newstripes.next, struct stripe_head, lru);
2595		list_del_init(&nsh->lru);
2596
2597#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2598		for (i = 0; i < nsh->nr_pages; i++) {
2599			if (nsh->pages[i])
2600				continue;
2601			nsh->pages[i] = alloc_page(GFP_NOIO);
2602			if (!nsh->pages[i])
2603				err = -ENOMEM;
2604		}
2605
2606		for (i = conf->raid_disks; i < newsize; i++) {
2607			if (nsh->dev[i].page)
2608				continue;
2609			nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2610			nsh->dev[i].orig_page = nsh->dev[i].page;
2611			nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2612		}
2613#else
2614		for (i=conf->raid_disks; i < newsize; i++)
2615			if (nsh->dev[i].page == NULL) {
2616				struct page *p = alloc_page(GFP_NOIO);
2617				nsh->dev[i].page = p;
2618				nsh->dev[i].orig_page = p;
2619				nsh->dev[i].offset = 0;
2620				if (!p)
2621					err = -ENOMEM;
2622			}
2623#endif
2624		raid5_release_stripe(nsh);
2625	}
2626	/* critical section passed, GFP_NOIO no longer needed */
2627
2628	if (!err)
2629		conf->pool_size = newsize;
2630	mutex_unlock(&conf->cache_size_mutex);
2631
2632	return err;
2633}
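
/*
 * Step 2's quota arithmetic, as a worked example with assumed numbers
 * (max_nr_stripes = 20, NR_STRIPE_HASH_LOCKS = 8): each hash bucket gets
 * 20 / 8 = 2 stripes, plus one extra while (20 % 8) = 4 is greater than
 * the bucket index.  Buckets 0-3 therefore receive 3 new stripes and
 * buckets 4-7 receive 2, i.e. the remainder lands on the lowest-numbered
 * buckets and the totals still add up to 20.
 */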
2634
2635static int drop_one_stripe(struct r5conf *conf)
2636{
2637	struct stripe_head *sh;
2638	int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2639
2640	spin_lock_irq(conf->hash_locks + hash);
2641	sh = get_free_stripe(conf, hash);
2642	spin_unlock_irq(conf->hash_locks + hash);
2643	if (!sh)
2644		return 0;
2645	BUG_ON(atomic_read(&sh->count));
2646	shrink_buffers(sh);
2647	free_stripe(conf->slab_cache, sh);
2648	atomic_dec(&conf->active_stripes);
2649	conf->max_nr_stripes--;
2650	return 1;
2651}
2652
2653static void shrink_stripes(struct r5conf *conf)
2654{
2655	while (conf->max_nr_stripes &&
2656	       drop_one_stripe(conf))
2657		;
2658
2659	kmem_cache_destroy(conf->slab_cache);
2660	conf->slab_cache = NULL;
2661}
2662
2663static void raid5_end_read_request(struct bio * bi)
2664{
2665	struct stripe_head *sh = bi->bi_private;
2666	struct r5conf *conf = sh->raid_conf;
2667	int disks = sh->disks, i;
2668	char b[BDEVNAME_SIZE];
2669	struct md_rdev *rdev = NULL;
2670	sector_t s;
2671
2672	for (i=0 ; i<disks; i++)
2673		if (bi == &sh->dev[i].req)
2674			break;
2675
2676	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2677		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2678		bi->bi_status);
2679	if (i == disks) {
2680		bio_reset(bi);
2681		BUG();
2682		return;
2683	}
2684	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2685		/* If replacement finished while this request was outstanding,
2686		 * 'replacement' might be NULL already.
2687		 * In that case it moved down to 'rdev'.
2688		 * rdev is not removed until all requests are finished.
2689		 */
2690		rdev = conf->disks[i].replacement;
2691	if (!rdev)
2692		rdev = conf->disks[i].rdev;
2693
2694	if (use_new_offset(conf, sh))
2695		s = sh->sector + rdev->new_data_offset;
2696	else
2697		s = sh->sector + rdev->data_offset;
2698	if (!bi->bi_status) {
2699		set_bit(R5_UPTODATE, &sh->dev[i].flags);
2700		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2701			/* Note that this cannot happen on a
2702			 * replacement device.  We just fail those on
2703			 * any error
2704			 */
2705			pr_info_ratelimited(
2706				"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2707				mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2708				(unsigned long long)s,
2709				bdevname(rdev->bdev, b));
2710			atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2711			clear_bit(R5_ReadError, &sh->dev[i].flags);
2712			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2713		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2714			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2715
2716		if (test_bit(R5_InJournal, &sh->dev[i].flags))
2717			/*
2718			 * end read for a page in journal, this
2719			 * must be preparing for prexor in rmw
2720			 */
2721			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2722
2723		if (atomic_read(&rdev->read_errors))
2724			atomic_set(&rdev->read_errors, 0);
2725	} else {
2726		const char *bdn = bdevname(rdev->bdev, b);
2727		int retry = 0;
2728		int set_bad = 0;
2729
2730		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2731		if (!(bi->bi_status == BLK_STS_PROTECTION))
2732			atomic_inc(&rdev->read_errors);
2733		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2734			pr_warn_ratelimited(
2735				"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2736				mdname(conf->mddev),
2737				(unsigned long long)s,
2738				bdn);
2739		else if (conf->mddev->degraded >= conf->max_degraded) {
2740			set_bad = 1;
2741			pr_warn_ratelimited(
2742				"md/raid:%s: read error not correctable (sector %llu on %s).\n",
2743				mdname(conf->mddev),
2744				(unsigned long long)s,
2745				bdn);
2746		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2747			/* Oh, no!!! */
2748			set_bad = 1;
2749			pr_warn_ratelimited(
2750				"md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2751				mdname(conf->mddev),
2752				(unsigned long long)s,
2753				bdn);
2754		} else if (atomic_read(&rdev->read_errors)
2755			 > conf->max_nr_stripes) {
2756			if (!test_bit(Faulty, &rdev->flags)) {
2757				pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2758				    mdname(conf->mddev),
2759				    atomic_read(&rdev->read_errors),
2760				    conf->max_nr_stripes);
2761				pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2762				    mdname(conf->mddev), bdn);
2763			}
2764		} else
2765			retry = 1;
2766		if (set_bad && test_bit(In_sync, &rdev->flags)
2767		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2768			retry = 1;
2769		if (retry)
2770			if (sh->qd_idx >= 0 && sh->pd_idx == i)
2771				set_bit(R5_ReadError, &sh->dev[i].flags);
2772			else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2773				set_bit(R5_ReadError, &sh->dev[i].flags);
2774				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2775			} else
2776				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2777		else {
2778			clear_bit(R5_ReadError, &sh->dev[i].flags);
2779			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2780			if (!(set_bad
2781			      && test_bit(In_sync, &rdev->flags)
2782			      && rdev_set_badblocks(
2783				      rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2784				md_error(conf->mddev, rdev);
2785		}
2786	}
2787	rdev_dec_pending(rdev, conf->mddev);
2788	bio_reset(bi);
2789	clear_bit(R5_LOCKED, &sh->dev[i].flags);
2790	set_bit(STRIPE_HANDLE, &sh->state);
2791	raid5_release_stripe(sh);
2792}
2793
2794static void raid5_end_write_request(struct bio *bi)
2795{
2796	struct stripe_head *sh = bi->bi_private;
2797	struct r5conf *conf = sh->raid_conf;
2798	int disks = sh->disks, i;
2799	struct md_rdev *rdev;
2800	sector_t first_bad;
2801	int bad_sectors;
2802	int replacement = 0;
2803
2804	for (i = 0 ; i < disks; i++) {
2805		if (bi == &sh->dev[i].req) {
2806			rdev = conf->disks[i].rdev;
2807			break;
2808		}
2809		if (bi == &sh->dev[i].rreq) {
2810			rdev = conf->disks[i].replacement;
2811			if (rdev)
2812				replacement = 1;
2813			else
2814				/* rdev was removed and 'replacement'
2815				 * replaced it.  rdev is not removed
2816				 * until all requests are finished.
2817				 */
2818				rdev = conf->disks[i].rdev;
2819			break;
2820		}
2821	}
2822	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2823		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2824		bi->bi_status);
2825	if (i == disks) {
2826		bio_reset(bi);
2827		BUG();
2828		return;
2829	}
2830
2831	if (replacement) {
2832		if (bi->bi_status)
2833			md_error(conf->mddev, rdev);
2834		else if (is_badblock(rdev, sh->sector,
2835				     RAID5_STRIPE_SECTORS(conf),
2836				     &first_bad, &bad_sectors))
2837			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2838	} else {
2839		if (bi->bi_status) {
2840			set_bit(STRIPE_DEGRADED, &sh->state);
2841			set_bit(WriteErrorSeen, &rdev->flags);
2842			set_bit(R5_WriteError, &sh->dev[i].flags);
2843			if (!test_and_set_bit(WantReplacement, &rdev->flags))
2844				set_bit(MD_RECOVERY_NEEDED,
2845					&rdev->mddev->recovery);
2846		} else if (is_badblock(rdev, sh->sector,
2847				       RAID5_STRIPE_SECTORS(conf),
2848				       &first_bad, &bad_sectors)) {
2849			set_bit(R5_MadeGood, &sh->dev[i].flags);
2850			if (test_bit(R5_ReadError, &sh->dev[i].flags))
2851				/* That was a successful write so make
2852				 * sure it looks like we already did
2853				 * a re-write.
2854				 */
2855				set_bit(R5_ReWrite, &sh->dev[i].flags);
2856		}
2857	}
2858	rdev_dec_pending(rdev, conf->mddev);
2859
2860	if (sh->batch_head && bi->bi_status && !replacement)
2861		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2862
2863	bio_reset(bi);
2864	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2865		clear_bit(R5_LOCKED, &sh->dev[i].flags);
2866	set_bit(STRIPE_HANDLE, &sh->state);
2867
2868	if (sh->batch_head && sh != sh->batch_head)
2869		raid5_release_stripe(sh->batch_head);
2870	raid5_release_stripe(sh);
2871}
2872
2873static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2874{
2875	char b[BDEVNAME_SIZE];
2876	struct r5conf *conf = mddev->private;
2877	unsigned long flags;
2878	pr_debug("raid456: error called\n");
2879
2880	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
2881		mdname(mddev), bdevname(rdev->bdev, b));
2882
2883	spin_lock_irqsave(&conf->device_lock, flags);
2884	set_bit(Faulty, &rdev->flags);
2885	clear_bit(In_sync, &rdev->flags);
2886	mddev->degraded = raid5_calc_degraded(conf);
2887
2888	if (has_failed(conf)) {
2889		set_bit(MD_BROKEN, &conf->mddev->flags);
2890		conf->recovery_disabled = mddev->recovery_disabled;
2891
2892		pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2893			mdname(mddev), mddev->degraded, conf->raid_disks);
2894	} else {
2895		pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2896			mdname(mddev), conf->raid_disks - mddev->degraded);
2897	}
2898
2899	spin_unlock_irqrestore(&conf->device_lock, flags);
2900	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2901
2902	set_bit(Blocked, &rdev->flags);
2903	set_mask_bits(&mddev->sb_flags, 0,
2904		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2905	r5c_update_on_rdev_error(mddev, rdev);
2906}
2907
2908/*
2909 * Input: a 'big' sector number,
2910 * Output: index of the data and parity disk, and the sector # in them.
2911 */
2912sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2913			      int previous, int *dd_idx,
2914			      struct stripe_head *sh)
2915{
2916	sector_t stripe, stripe2;
2917	sector_t chunk_number;
2918	unsigned int chunk_offset;
2919	int pd_idx, qd_idx;
2920	int ddf_layout = 0;
2921	sector_t new_sector;
2922	int algorithm = previous ? conf->prev_algo
2923				 : conf->algorithm;
2924	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2925					 : conf->chunk_sectors;
2926	int raid_disks = previous ? conf->previous_raid_disks
2927				  : conf->raid_disks;
2928	int data_disks = raid_disks - conf->max_degraded;
2929
2930	/* First compute the information on this sector */
2931
2932	/*
2933	 * Compute the chunk number and the sector offset inside the chunk
2934	 */
2935	chunk_offset = sector_div(r_sector, sectors_per_chunk);
2936	chunk_number = r_sector;
2937
2938	/*
2939	 * Compute the stripe number
2940	 */
2941	stripe = chunk_number;
2942	*dd_idx = sector_div(stripe, data_disks);
2943	stripe2 = stripe;
2944	/*
2945	 * Select the parity disk based on the user selected algorithm.
2946	 */
2947	pd_idx = qd_idx = -1;
2948	switch(conf->level) {
2949	case 4:
2950		pd_idx = data_disks;
2951		break;
2952	case 5:
2953		switch (algorithm) {
2954		case ALGORITHM_LEFT_ASYMMETRIC:
2955			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2956			if (*dd_idx >= pd_idx)
2957				(*dd_idx)++;
2958			break;
2959		case ALGORITHM_RIGHT_ASYMMETRIC:
2960			pd_idx = sector_div(stripe2, raid_disks);
2961			if (*dd_idx >= pd_idx)
2962				(*dd_idx)++;
2963			break;
2964		case ALGORITHM_LEFT_SYMMETRIC:
2965			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2966			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2967			break;
2968		case ALGORITHM_RIGHT_SYMMETRIC:
2969			pd_idx = sector_div(stripe2, raid_disks);
2970			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2971			break;
2972		case ALGORITHM_PARITY_0:
2973			pd_idx = 0;
2974			(*dd_idx)++;
2975			break;
2976		case ALGORITHM_PARITY_N:
2977			pd_idx = data_disks;
2978			break;
2979		default:
2980			BUG();
2981		}
2982		break;
2983	case 6:
2984
2985		switch (algorithm) {
2986		case ALGORITHM_LEFT_ASYMMETRIC:
2987			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2988			qd_idx = pd_idx + 1;
2989			if (pd_idx == raid_disks-1) {
2990				(*dd_idx)++;	/* Q D D D P */
2991				qd_idx = 0;
2992			} else if (*dd_idx >= pd_idx)
2993				(*dd_idx) += 2; /* D D P Q D */
2994			break;
2995		case ALGORITHM_RIGHT_ASYMMETRIC:
2996			pd_idx = sector_div(stripe2, raid_disks);
2997			qd_idx = pd_idx + 1;
2998			if (pd_idx == raid_disks-1) {
2999				(*dd_idx)++;	/* Q D D D P */
3000				qd_idx = 0;
3001			} else if (*dd_idx >= pd_idx)
3002				(*dd_idx) += 2; /* D D P Q D */
3003			break;
3004		case ALGORITHM_LEFT_SYMMETRIC:
3005			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3006			qd_idx = (pd_idx + 1) % raid_disks;
3007			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3008			break;
3009		case ALGORITHM_RIGHT_SYMMETRIC:
3010			pd_idx = sector_div(stripe2, raid_disks);
3011			qd_idx = (pd_idx + 1) % raid_disks;
3012			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3013			break;
3014
3015		case ALGORITHM_PARITY_0:
3016			pd_idx = 0;
3017			qd_idx = 1;
3018			(*dd_idx) += 2;
3019			break;
3020		case ALGORITHM_PARITY_N:
3021			pd_idx = data_disks;
3022			qd_idx = data_disks + 1;
3023			break;
3024
3025		case ALGORITHM_ROTATING_ZERO_RESTART:
3026			/* Exactly the same as RIGHT_ASYMMETRIC, but the
3027			 * order of blocks for computing Q is different.
3028			 */
3029			pd_idx = sector_div(stripe2, raid_disks);
3030			qd_idx = pd_idx + 1;
3031			if (pd_idx == raid_disks-1) {
3032				(*dd_idx)++;	/* Q D D D P */
3033				qd_idx = 0;
3034			} else if (*dd_idx >= pd_idx)
3035				(*dd_idx) += 2; /* D D P Q D */
3036			ddf_layout = 1;
3037			break;
3038
3039		case ALGORITHM_ROTATING_N_RESTART:
3040			/* Same as left_asymmetric, but the first stripe is
3041			 * D D D P Q  rather than
3042			 * Q D D D P
3043			 */
3044			stripe2 += 1;
3045			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3046			qd_idx = pd_idx + 1;
3047			if (pd_idx == raid_disks-1) {
3048				(*dd_idx)++;	/* Q D D D P */
3049				qd_idx = 0;
3050			} else if (*dd_idx >= pd_idx)
3051				(*dd_idx) += 2; /* D D P Q D */
3052			ddf_layout = 1;
3053			break;
3054
3055		case ALGORITHM_ROTATING_N_CONTINUE:
3056			/* Same as left_symmetric but Q is before P */
3057			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3058			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3059			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3060			ddf_layout = 1;
3061			break;
3062
3063		case ALGORITHM_LEFT_ASYMMETRIC_6:
3064			/* RAID5 left_asymmetric, with Q on last device */
3065			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3066			if (*dd_idx >= pd_idx)
3067				(*dd_idx)++;
3068			qd_idx = raid_disks - 1;
3069			break;
3070
3071		case ALGORITHM_RIGHT_ASYMMETRIC_6:
3072			pd_idx = sector_div(stripe2, raid_disks-1);
3073			if (*dd_idx >= pd_idx)
3074				(*dd_idx)++;
3075			qd_idx = raid_disks - 1;
3076			break;
3077
3078		case ALGORITHM_LEFT_SYMMETRIC_6:
3079			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3080			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3081			qd_idx = raid_disks - 1;
3082			break;
3083
3084		case ALGORITHM_RIGHT_SYMMETRIC_6:
3085			pd_idx = sector_div(stripe2, raid_disks-1);
3086			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3087			qd_idx = raid_disks - 1;
3088			break;
3089
3090		case ALGORITHM_PARITY_0_6:
3091			pd_idx = 0;
3092			(*dd_idx)++;
3093			qd_idx = raid_disks - 1;
3094			break;
3095
3096		default:
3097			BUG();
3098		}
3099		break;
3100	}
3101
3102	if (sh) {
3103		sh->pd_idx = pd_idx;
3104		sh->qd_idx = qd_idx;
3105		sh->ddf_layout = ddf_layout;
3106	}
3107	/*
3108	 * Finally, compute the new sector number
3109	 */
3110	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3111	return new_sector;
3112}
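
/*
 * Worked example of the forward mapping (assumed geometry: RAID5 with
 * raid_disks = 4, ALGORITHM_LEFT_SYMMETRIC, 64-sector chunks, so
 * data_disks = 3):
 *
 *   r_sector = 1000
 *   chunk_offset = 1000 % 64 = 40,   chunk_number = 1000 / 64 = 15
 *   *dd_idx = 15 % 3 = 0,            stripe = 15 / 3 = 5
 *   pd_idx  = 3 - (5 % 4) = 2
 *   *dd_idx = (2 + 1 + 0) % 4 = 3
 *   new_sector = 5 * 64 + 40 = 360
 *
 * so array sector 1000 maps to device sector 360 on disk 3, with parity
 * for that stripe on disk 2.
 */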
3113
3114sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3115{
3116	struct r5conf *conf = sh->raid_conf;
3117	int raid_disks = sh->disks;
3118	int data_disks = raid_disks - conf->max_degraded;
3119	sector_t new_sector = sh->sector, check;
3120	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3121					 : conf->chunk_sectors;
3122	int algorithm = previous ? conf->prev_algo
3123				 : conf->algorithm;
3124	sector_t stripe;
3125	int chunk_offset;
3126	sector_t chunk_number;
3127	int dummy1, dd_idx = i;
3128	sector_t r_sector;
3129	struct stripe_head sh2;
3130
3131	chunk_offset = sector_div(new_sector, sectors_per_chunk);
3132	stripe = new_sector;
3133
3134	if (i == sh->pd_idx)
3135		return 0;
3136	switch(conf->level) {
3137	case 4: break;
3138	case 5:
3139		switch (algorithm) {
3140		case ALGORITHM_LEFT_ASYMMETRIC:
3141		case ALGORITHM_RIGHT_ASYMMETRIC:
3142			if (i > sh->pd_idx)
3143				i--;
3144			break;
3145		case ALGORITHM_LEFT_SYMMETRIC:
3146		case ALGORITHM_RIGHT_SYMMETRIC:
3147			if (i < sh->pd_idx)
3148				i += raid_disks;
3149			i -= (sh->pd_idx + 1);
3150			break;
3151		case ALGORITHM_PARITY_0:
3152			i -= 1;
3153			break;
3154		case ALGORITHM_PARITY_N:
3155			break;
3156		default:
3157			BUG();
3158		}
3159		break;
3160	case 6:
3161		if (i == sh->qd_idx)
3162			return 0; /* It is the Q disk */
3163		switch (algorithm) {
3164		case ALGORITHM_LEFT_ASYMMETRIC:
3165		case ALGORITHM_RIGHT_ASYMMETRIC:
3166		case ALGORITHM_ROTATING_ZERO_RESTART:
3167		case ALGORITHM_ROTATING_N_RESTART:
3168			if (sh->pd_idx == raid_disks-1)
3169				i--;	/* Q D D D P */
3170			else if (i > sh->pd_idx)
3171				i -= 2; /* D D P Q D */
3172			break;
3173		case ALGORITHM_LEFT_SYMMETRIC:
3174		case ALGORITHM_RIGHT_SYMMETRIC:
3175			if (sh->pd_idx == raid_disks-1)
3176				i--; /* Q D D D P */
3177			else {
3178				/* D D P Q D */
3179				if (i < sh->pd_idx)
3180					i += raid_disks;
3181				i -= (sh->pd_idx + 2);
3182			}
3183			break;
3184		case ALGORITHM_PARITY_0:
3185			i -= 2;
3186			break;
3187		case ALGORITHM_PARITY_N:
3188			break;
3189		case ALGORITHM_ROTATING_N_CONTINUE:
3190			/* Like left_symmetric, but P is before Q */
3191			if (sh->pd_idx == 0)
3192				i--;	/* P D D D Q */
3193			else {
3194				/* D D Q P D */
3195				if (i < sh->pd_idx)
3196					i += raid_disks;
3197				i -= (sh->pd_idx + 1);
3198			}
3199			break;
3200		case ALGORITHM_LEFT_ASYMMETRIC_6:
3201		case ALGORITHM_RIGHT_ASYMMETRIC_6:
3202			if (i > sh->pd_idx)
3203				i--;
3204			break;
3205		case ALGORITHM_LEFT_SYMMETRIC_6:
3206		case ALGORITHM_RIGHT_SYMMETRIC_6:
3207			if (i < sh->pd_idx)
3208				i += data_disks + 1;
3209			i -= (sh->pd_idx + 1);
3210			break;
3211		case ALGORITHM_PARITY_0_6:
3212			i -= 1;
3213			break;
3214		default:
3215			BUG();
3216		}
3217		break;
3218	}
3219
3220	chunk_number = stripe * data_disks + i;
3221	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3222
3223	check = raid5_compute_sector(conf, r_sector,
3224				     previous, &dummy1, &sh2);
3225	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3226		|| sh2.qd_idx != sh->qd_idx) {
3227		pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3228			mdname(conf->mddev));
3229		return 0;
3230	}
3231	return r_sector;
3232}
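
/*
 * raid5_compute_blocknr() is the inverse of the mapping above; running the
 * same assumed geometry backwards: disk 3 at device sector 360 gives
 * chunk_offset = 40 and stripe = 5, and with pd_idx = 2 the LEFT_SYMMETRIC
 * branch turns i = 3 into data index 0, so chunk_number = 5 * 3 + 0 = 15
 * and r_sector = 15 * 64 + 40 = 1000, which the raid5_compute_sector()
 * cross-check at the end accepts.
 */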
3233
3234/*
3235 * There are cases where we want handle_stripe_dirtying() and
3236 * schedule_reconstruction() to delay towrite to some dev of a stripe.
3237 *
3238 * This function checks whether we want to delay the towrite. Specifically,
3239 * we delay the towrite when:
3240 *
3241 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
3242 *      stripe has data in journal (for other devices).
3243 *
3244 *      In this case, when reading data for the non-overwrite dev, it is
3245 *      necessary to handle complex rmw of write back cache (prexor with
3246 *      orig_page, and xor with page). To keep read path simple, we would
3247 *      like to flush data in journal to RAID disks first, so complex rmw
3248 *      is handled in the write path (handle_stripe_dirtying).
3249 *
3250 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
3251 *
3252 *      It is important to be able to flush all stripes in raid5-cache.
3253 *      Therefore, we need reserve some space on the journal device for
3254 *      these flushes. If flush operation includes pending writes to the
3255 *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
3256 *      for the flush out. If we exclude these pending writes from flush
3257 *      operation, we only need (conf->max_degraded + 1) pages per stripe.
3258 *      Therefore, excluding pending writes in these cases enables more
3259 *      efficient use of the journal device.
3260 *
3261 *      Note: To make sure the stripe makes progress, we only delay
3262 *      towrite for stripes with data already in journal (injournal > 0).
3263 *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3264 *      no_space_stripes list.
3265 *
3266 *   3. during journal failure
3267 *      In journal failure, we try to flush all cached data to raid disks
3268 *      based on data in stripe cache. The array is read-only to upper
3269 *      layers, so we would skip all pending writes.
3270 *
3271 */
3272static inline bool delay_towrite(struct r5conf *conf,
3273				 struct r5dev *dev,
3274				 struct stripe_head_state *s)
3275{
3276	/* case 1 above */
3277	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3278	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
3279		return true;
3280	/* case 2 above */
3281	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3282	    s->injournal > 0)
3283		return true;
3284	/* case 3 above */
3285	if (s->log_failed && s->injournal)
3286		return true;
3287	return false;
3288}
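
/*
 * The space argument in case 2 above, with assumed numbers: on an 8-drive
 * RAID6 (max_degraded = 2), flushing a stripe together with its pending
 * writes can require up to raid_disks + 1 = 9 journal pages, while a flush
 * that excludes the pending writes needs only max_degraded + 1 = 3 pages
 * per stripe.  Delaying towrite while R5C_LOG_CRITICAL is set therefore
 * roughly triples the number of stripes the reserved journal space can
 * flush out.
 */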
3289
3290static void
3291schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3292			 int rcw, int expand)
3293{
3294	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3295	struct r5conf *conf = sh->raid_conf;
3296	int level = conf->level;
3297
3298	if (rcw) {
3299		/*
3300		 * In some cases, handle_stripe_dirtying initially decided to
3301		 * run rmw and allocates extra page for prexor. However, rcw is
3302		 * cheaper later on. We need to free the extra page now,
3303		 * because we won't be able to do that in ops_complete_prexor().
3304		 */
3305		r5c_release_extra_page(sh);
3306
3307		for (i = disks; i--; ) {
3308			struct r5dev *dev = &sh->dev[i];
3309
3310			if (dev->towrite && !delay_towrite(conf, dev, s)) {
3311				set_bit(R5_LOCKED, &dev->flags);
3312				set_bit(R5_Wantdrain, &dev->flags);
3313				if (!expand)
3314					clear_bit(R5_UPTODATE, &dev->flags);
3315				s->locked++;
3316			} else if (test_bit(R5_InJournal, &dev->flags)) {
3317				set_bit(R5_LOCKED, &dev->flags);
3318				s->locked++;
3319			}
3320		}
3321		/* if we are not expanding this is a proper write request, and
3322		 * there will be bios with new data to be drained into the
3323		 * stripe cache
3324		 */
3325		if (!expand) {
3326			if (!s->locked)
3327				/* False alarm, nothing to do */
3328				return;
3329			sh->reconstruct_state = reconstruct_state_drain_run;
3330			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3331		} else
3332			sh->reconstruct_state = reconstruct_state_run;
3333
3334		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3335
3336		if (s->locked + conf->max_degraded == disks)
3337			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3338				atomic_inc(&conf->pending_full_writes);
3339	} else {
3340		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3341			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3342		BUG_ON(level == 6 &&
3343			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3344			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3345
3346		for (i = disks; i--; ) {
3347			struct r5dev *dev = &sh->dev[i];
3348			if (i == pd_idx || i == qd_idx)
3349				continue;
3350
3351			if (dev->towrite &&
3352			    (test_bit(R5_UPTODATE, &dev->flags) ||
3353			     test_bit(R5_Wantcompute, &dev->flags))) {
3354				set_bit(R5_Wantdrain, &dev->flags);
3355				set_bit(R5_LOCKED, &dev->flags);
3356				clear_bit(R5_UPTODATE, &dev->flags);
3357				s->locked++;
3358			} else if (test_bit(R5_InJournal, &dev->flags)) {
3359				set_bit(R5_LOCKED, &dev->flags);
3360				s->locked++;
3361			}
3362		}
3363		if (!s->locked)
3364			/* False alarm - nothing to do */
3365			return;
3366		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3367		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3368		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3369		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3370	}
3371
3372	/* keep the parity disk(s) locked while asynchronous operations
3373	 * are in flight
3374	 */
3375	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3376	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3377	s->locked++;
3378
3379	if (level == 6) {
3380		int qd_idx = sh->qd_idx;
3381		struct r5dev *dev = &sh->dev[qd_idx];
3382
3383		set_bit(R5_LOCKED, &dev->flags);
3384		clear_bit(R5_UPTODATE, &dev->flags);
3385		s->locked++;
3386	}
3387
3388	if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3389	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3390	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3391	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3392		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3393
3394	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3395		__func__, (unsigned long long)sh->sector,
3396		s->locked, s->ops_request);
3397}
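
/*
 * Rough summary of what schedule_reconstruction() queues for the two write
 * paths above (RAID5, non-expand case; a sketch, not an exhaustive list):
 *
 *   rcw (reconstruct-write): ops_request gets STRIPE_OP_BIODRAIN and
 *       STRIPE_OP_RECONSTRUCT; the to-be-drained data blocks and the
 *       parity block end up R5_LOCKED.
 *   rmw (read-modify-write): ops_request gets STRIPE_OP_PREXOR,
 *       STRIPE_OP_BIODRAIN and STRIPE_OP_RECONSTRUCT; only the blocks
 *       being rewritten (already R5_UPTODATE or R5_Wantcompute) plus
 *       parity are locked.
 *
 * raid_run_ops() then executes those bits in the order shown earlier.
 */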
3398
3399/*
3400 * Each stripe/dev can have one or more bion attached.
3401 * toread/towrite point to the first in a chain.
3402 * The bi_next chain must be in order.
3403 */
3404static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3405			  int forwrite, int previous)
3406{
3407	struct bio **bip;
3408	struct r5conf *conf = sh->raid_conf;
3409	int firstwrite=0;
3410
3411	pr_debug("adding bi b#%llu to stripe s#%llu\n",
3412		(unsigned long long)bi->bi_iter.bi_sector,
3413		(unsigned long long)sh->sector);
3414
3415	spin_lock_irq(&sh->stripe_lock);
3416	sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3417	/* Don't allow new IO added to stripes in batch list */
3418	if (sh->batch_head)
3419		goto overlap;
3420	if (forwrite) {
3421		bip = &sh->dev[dd_idx].towrite;
3422		if (*bip == NULL)
3423			firstwrite = 1;
3424	} else
3425		bip = &sh->dev[dd_idx].toread;
3426	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3427		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3428			goto overlap;
3429		bip = & (*bip)->bi_next;
3430	}
3431	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3432		goto overlap;
3433
3434	if (forwrite && raid5_has_ppl(conf)) {
3435		/*
3436		 * With PPL only writes to consecutive data chunks within a
3437		 * stripe are allowed because for a single stripe_head we can
3438		 * only have one PPL entry at a time, which describes one data
3439		 * range. Not really an overlap, but wait_for_overlap can be
3440		 * used to handle this.
3441		 */
3442		sector_t sector;
3443		sector_t first = 0;
3444		sector_t last = 0;
3445		int count = 0;
3446		int i;
3447
3448		for (i = 0; i < sh->disks; i++) {
3449			if (i != sh->pd_idx &&
3450			    (i == dd_idx || sh->dev[i].towrite)) {
3451				sector = sh->dev[i].sector;
3452				if (count == 0 || sector < first)
3453					first = sector;
3454				if (sector > last)
3455					last = sector;
3456				count++;
3457			}
3458		}
3459
3460		if (first + conf->chunk_sectors * (count - 1) != last)
3461			goto overlap;
3462	}
3463
3464	if (!forwrite || previous)
3465		clear_bit(STRIPE_BATCH_READY, &sh->state);
3466
3467	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3468	if (*bip)
3469		bi->bi_next = *bip;
3470	*bip = bi;
3471	bio_inc_remaining(bi);
3472	md_write_inc(conf->mddev, bi);
3473
3474	if (forwrite) {
3475		/* check if page is covered */
3476		sector_t sector = sh->dev[dd_idx].sector;
3477		for (bi=sh->dev[dd_idx].towrite;
3478		     sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3479			     bi && bi->bi_iter.bi_sector <= sector;
3480		     bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3481			if (bio_end_sector(bi) >= sector)
3482				sector = bio_end_sector(bi);
3483		}
3484		if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3485			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3486				sh->overwrite_disks++;
3487	}
3488
3489	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3490		(unsigned long long)(*bip)->bi_iter.bi_sector,
3491		(unsigned long long)sh->sector, dd_idx);
3492
3493	if (conf->mddev->bitmap && firstwrite) {
3494		/* Cannot hold spinlock over bitmap_startwrite,
3495		 * but must ensure this isn't added to a batch until
3496		 * we have added to the bitmap and set bm_seq.
3497		 * So set STRIPE_BITMAP_PENDING to prevent
3498		 * batching.
3499		 * If multiple add_stripe_bio() calls race here they
3500		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
3501		 * to complete "bitmap_startwrite" gets to set
3502		 * STRIPE_BIT_DELAY.  This is important as once a stripe
3503		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3504		 * any more.
3505		 */
3506		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3507		spin_unlock_irq(&sh->stripe_lock);
3508		md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3509				     RAID5_STRIPE_SECTORS(conf), 0);
3510		spin_lock_irq(&sh->stripe_lock);
3511		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3512		if (!sh->batch_head) {
3513			sh->bm_seq = conf->seq_flush+1;
3514			set_bit(STRIPE_BIT_DELAY, &sh->state);
3515		}
3516	}
3517	spin_unlock_irq(&sh->stripe_lock);
3518
3519	if (stripe_can_batch(sh))
3520		stripe_add_to_batch_list(conf, sh);
3521	return 1;
3522
3523 overlap:
3524	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3525	spin_unlock_irq(&sh->stripe_lock);
3526	return 0;
3527}
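
/*
 * The "page is covered" walk in add_stripe_bio() simply checks that the
 * queued write bios tile the whole stripe chunk on this device.  A sketch
 * with assumed 8-sector RAID5_STRIPE_SECTORS and dev->sector = 1000:
 * towrite bios covering [1000,1004) and [1004,1008) advance 'sector' to
 * 1008 >= 1000 + 8, so R5_OVERWRITE is set and overwrite_disks is bumped;
 * a chain covering only [1000,1006) stops short and the device remains a
 * partial (read-modify-write) target.
 */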
3528
3529static void end_reshape(struct r5conf *conf);
3530
3531static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3532			    struct stripe_head *sh)
3533{
3534	int sectors_per_chunk =
3535		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3536	int dd_idx;
3537	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3538	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3539
3540	raid5_compute_sector(conf,
3541			     stripe * (disks - conf->max_degraded)
3542			     *sectors_per_chunk + chunk_offset,
3543			     previous,
3544			     &dd_idx, sh);
3545}
3546
3547static void
3548handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3549		     struct stripe_head_state *s, int disks)
3550{
3551	int i;
3552	BUG_ON(sh->batch_head);
3553	for (i = disks; i--; ) {
3554		struct bio *bi;
3555		int bitmap_end = 0;
3556
3557		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3558			struct md_rdev *rdev;
3559			rcu_read_lock();
3560			rdev = rcu_dereference(conf->disks[i].rdev);
3561			if (rdev && test_bit(In_sync, &rdev->flags) &&
3562			    !test_bit(Faulty, &rdev->flags))
3563				atomic_inc(&rdev->nr_pending);
3564			else
3565				rdev = NULL;
3566			rcu_read_unlock();
3567			if (rdev) {
3568				if (!rdev_set_badblocks(
3569					    rdev,
3570					    sh->sector,
3571					    RAID5_STRIPE_SECTORS(conf), 0))
3572					md_error(conf->mddev, rdev);
3573				rdev_dec_pending(rdev, conf->mddev);
3574			}
3575		}
3576		spin_lock_irq(&sh->stripe_lock);
3577		/* fail all writes first */
3578		bi = sh->dev[i].towrite;
3579		sh->dev[i].towrite = NULL;
3580		sh->overwrite_disks = 0;
3581		spin_unlock_irq(&sh->stripe_lock);
3582		if (bi)
3583			bitmap_end = 1;
3584
3585		log_stripe_write_finished(sh);
3586
3587		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3588			wake_up(&conf->wait_for_overlap);
3589
3590		while (bi && bi->bi_iter.bi_sector <
3591			sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3592			struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3593
3594			md_write_end(conf->mddev);
3595			bio_io_error(bi);
3596			bi = nextbi;
3597		}
3598		if (bitmap_end)
3599			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3600					   RAID5_STRIPE_SECTORS(conf), 0, 0);
3601		bitmap_end = 0;
3602		/* and fail all 'written' */
3603		bi = sh->dev[i].written;
3604		sh->dev[i].written = NULL;
3605		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3606			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3607			sh->dev[i].page = sh->dev[i].orig_page;
3608		}
3609
3610		if (bi) bitmap_end = 1;
3611		while (bi && bi->bi_iter.bi_sector <
3612		       sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3613			struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3614
3615			md_write_end(conf->mddev);
3616			bio_io_error(bi);
3617			bi = bi2;
3618		}
3619
3620		/* fail any reads if this device is non-operational and
3621		 * the data has not reached the cache yet.
3622		 */
3623		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3624		    s->failed > conf->max_degraded &&
3625		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3626		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
3627			spin_lock_irq(&sh->stripe_lock);
3628			bi = sh->dev[i].toread;
3629			sh->dev[i].toread = NULL;
3630			spin_unlock_irq(&sh->stripe_lock);
3631			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3632				wake_up(&conf->wait_for_overlap);
3633			if (bi)
3634				s->to_read--;
3635			while (bi && bi->bi_iter.bi_sector <
3636			       sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3637				struct bio *nextbi =
3638					r5_next_bio(conf, bi, sh->dev[i].sector);
3639
3640				bio_io_error(bi);
3641				bi = nextbi;
3642			}
3643		}
3644		if (bitmap_end)
3645			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3646					   RAID5_STRIPE_SECTORS(conf), 0, 0);
3647		/* If we were in the middle of a write the parity block might
3648		 * still be locked - so just clear all R5_LOCKED flags
3649		 */
3650		clear_bit(R5_LOCKED, &sh->dev[i].flags);
3651	}
3652	s->to_write = 0;
3653	s->written = 0;
3654
3655	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3656		if (atomic_dec_and_test(&conf->pending_full_writes))
3657			md_wakeup_thread(conf->mddev->thread);
3658}
3659
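/*
 * handle_failed_sync - a sync/recovery request for this stripe cannot be
 * completed.  For recover/replace, record bad blocks on all not-in-sync
 * devices (or disable recovery if that fails), then report the result
 * through md_done_sync().
 */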
3660static void
3661handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3662		   struct stripe_head_state *s)
3663{
3664	int abort = 0;
3665	int i;
3666
3667	BUG_ON(sh->batch_head);
3668	clear_bit(STRIPE_SYNCING, &sh->state);
3669	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3670		wake_up(&conf->wait_for_overlap);
3671	s->syncing = 0;
3672	s->replacing = 0;
3673	/* There is nothing more to do for sync/check/repair.
3674	 * Don't even need to abort as that is handled elsewhere
3675	 * if needed, and not always wanted e.g. if there is a known
3676	 * bad block here.
3677	 * For recover/replace we need to record a bad block on all
3678	 * non-sync devices, or abort the recovery
3679	 */
3680	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3681		/* During recovery devices cannot be removed, so
3682		 * locking and refcounting of rdevs is not needed
3683		 */
3684		rcu_read_lock();
3685		for (i = 0; i < conf->raid_disks; i++) {
3686			struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3687			if (rdev
3688			    && !test_bit(Faulty, &rdev->flags)
3689			    && !test_bit(In_sync, &rdev->flags)
3690			    && !rdev_set_badblocks(rdev, sh->sector,
3691						   RAID5_STRIPE_SECTORS(conf), 0))
3692				abort = 1;
3693			rdev = rcu_dereference(conf->disks[i].replacement);
3694			if (rdev
3695			    && !test_bit(Faulty, &rdev->flags)
3696			    && !test_bit(In_sync, &rdev->flags)
3697			    && !rdev_set_badblocks(rdev, sh->sector,
3698						   RAID5_STRIPE_SECTORS(conf), 0))
3699				abort = 1;
3700		}
3701		rcu_read_unlock();
3702		if (abort)
3703			conf->recovery_disabled =
3704				conf->mddev->recovery_disabled;
3705	}
3706	md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3707}
3708
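/*
 * want_replace - return 1 if disk_idx has a working replacement device
 * that still needs this stripe written to it (it has not been recovered
 * past this sector yet).
 */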
3709static int want_replace(struct stripe_head *sh, int disk_idx)
3710{
3711	struct md_rdev *rdev;
3712	int rv = 0;
3713
3714	rcu_read_lock();
3715	rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3716	if (rdev
3717	    && !test_bit(Faulty, &rdev->flags)
3718	    && !test_bit(In_sync, &rdev->flags)
3719	    && (rdev->recovery_offset <= sh->sector
3720		|| rdev->mddev->recovery_cp <= sh->sector))
3721		rv = 1;
3722	rcu_read_unlock();
3723	return rv;
3724}
3725
3726static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3727			   int disk_idx, int disks)
3728{
3729	struct r5dev *dev = &sh->dev[disk_idx];
3730	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3731				  &sh->dev[s->failed_num[1]] };
3732	int i;
3733	bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3734
3735
3736	if (test_bit(R5_LOCKED, &dev->flags) ||
3737	    test_bit(R5_UPTODATE, &dev->flags))
3738		/* No point reading this as we already have it or have
3739		 * decided to get it.
3740		 */
3741		return 0;
3742
3743	if (dev->toread ||
3744	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3745		/* We need this block to directly satisfy a request */
3746		return 1;
3747
3748	if (s->syncing || s->expanding ||
3749	    (s->replacing && want_replace(sh, disk_idx)))
3750		/* When syncing or expanding we read everything.
3751		 * When replacing, we need the replaced block.
3752		 */
3753		return 1;
3754
3755	if ((s->failed >= 1 && fdev[0]->toread) ||
3756	    (s->failed >= 2 && fdev[1]->toread))
3757		/* If we want to read from a failed device, then
3758		 * we need to actually read every other device.
3759		 */
3760		return 1;
3761
3762	/* Sometimes neither read-modify-write nor reconstruct-write
3763	 * cycles can work.  In those cases we read every block we
3764	 * can.  Then the parity-update is certain to have enough to
3765	 * work with.
3766	 * This can only be a problem when we need to write something,
3767	 * and some device has failed.  If either of those tests
3768	 * fail we need look no further.
3769	 */
3770	if (!s->failed || !s->to_write)
3771		return 0;
3772
3773	if (test_bit(R5_Insync, &dev->flags) &&
3774	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3775		/* Pre-reads are not permitted until after a short delay
3776		 * to gather multiple requests.  However if this
3777		 * device is not Insync, the block could only be computed
3778		 * and there is no need to delay that.
3779		 */
3780		return 0;
3781
3782	for (i = 0; i < s->failed && i < 2; i++) {
3783		if (fdev[i]->towrite &&
3784		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3785		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3786			/* If we have a partial write to a failed
3787			 * device, then we will need to reconstruct
3788			 * the content of that device, so all other
3789			 * devices must be read.
3790			 */
3791			return 1;
3792
3793		if (s->failed >= 2 &&
3794		    (fdev[i]->towrite ||
3795		     s->failed_num[i] == sh->pd_idx ||
3796		     s->failed_num[i] == sh->qd_idx) &&
3797		    !test_bit(R5_UPTODATE, &fdev[i]->flags))
3798			/* In max degraded raid6, if the failed disk is P, Q,
3799			 * or we want to read the failed disk, we need to do
3800			 * reconstruct-write.
3801			 */
3802			force_rcw = true;
3803	}
3804
3805	/* If we are forced to do a reconstruct-write, because parity
3806	 * cannot be trusted and we are currently recovering it, there
3807	 * is extra need to be careful.
3808	 * If one of the devices that we would need to read, because
3809	 * it is not being overwritten (and maybe not written at all)
3810	 * is missing/faulty, then we need to read everything we can.
3811	 */
3812	if (!force_rcw &&
3813	    sh->sector < sh->raid_conf->mddev->recovery_cp)
3814		/* reconstruct-write isn't being forced */
3815		return 0;
3816	for (i = 0; i < s->failed && i < 2; i++) {
3817		if (s->failed_num[i] != sh->pd_idx &&
3818		    s->failed_num[i] != sh->qd_idx &&
3819		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3820		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3821			return 1;
3822	}
3823
3824	return 0;
3825}
3826
3827/* fetch_block - checks the given member device to see if its data needs
3828 * to be read or computed to satisfy a request.
3829 *
3830 * Returns 1 when no more member devices need to be checked, otherwise returns
3831 * 0 to tell the loop in handle_stripe_fill to continue
3832 */
3833static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3834		       int disk_idx, int disks)
3835{
3836	struct r5dev *dev = &sh->dev[disk_idx];
3837
3838	/* is the data in this block needed, and can we get it? */
3839	if (need_this_block(sh, s, disk_idx, disks)) {
3840		/* we would like to get this block, possibly by computing it,
3841		 * otherwise read it if the backing disk is insync
3842		 */
3843		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3844		BUG_ON(test_bit(R5_Wantread, &dev->flags));
3845		BUG_ON(sh->batch_head);
3846
3847		/*
3848		 * In the raid6 case if the only non-uptodate disk is P
3849		 * then we already trusted P to compute the other failed
3850		 * drives. It is safe to compute rather than re-read P.
3851		 * In other cases we only compute blocks from failed
3852		 * devices, otherwise check/repair might fail to detect
3853		 * a real inconsistency.
3854		 */
3855
3856		if ((s->uptodate == disks - 1) &&
3857		    ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3858		    (s->failed && (disk_idx == s->failed_num[0] ||
3859				   disk_idx == s->failed_num[1])))) {
3860			/* a disk has failed and we're requested to fetch it,
3861			 * so compute it
3862			 */
3863			pr_debug("Computing stripe %llu block %d\n",
3864			       (unsigned long long)sh->sector, disk_idx);
3865			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3866			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3867			set_bit(R5_Wantcompute, &dev->flags);
3868			sh->ops.target = disk_idx;
3869			sh->ops.target2 = -1; /* no 2nd target */
3870			s->req_compute = 1;
3871			/* Careful: from this point on 'uptodate' is in the eye
3872			 * of raid_run_ops which services 'compute' operations
3873			 * before writes. R5_Wantcompute flags a block that will
3874			 * be R5_UPTODATE by the time it is needed for a
3875			 * subsequent operation.
3876			 */
3877			s->uptodate++;
3878			return 1;
3879		} else if (s->uptodate == disks-2 && s->failed >= 2) {
3880			/* Computing 2-failure is *very* expensive; only
3881			 * do it if failed >= 2
3882			 */
3883			int other;
3884			for (other = disks; other--; ) {
3885				if (other == disk_idx)
3886					continue;
3887				if (!test_bit(R5_UPTODATE,
3888				      &sh->dev[other].flags))
3889					break;
3890			}
3891			BUG_ON(other < 0);
3892			pr_debug("Computing stripe %llu blocks %d,%d\n",
3893			       (unsigned long long)sh->sector,
3894			       disk_idx, other);
3895			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3896			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3897			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3898			set_bit(R5_Wantcompute, &sh->dev[other].flags);
3899			sh->ops.target = disk_idx;
3900			sh->ops.target2 = other;
3901			s->uptodate += 2;
3902			s->req_compute = 1;
3903			return 1;
3904		} else if (test_bit(R5_Insync, &dev->flags)) {
3905			set_bit(R5_LOCKED, &dev->flags);
3906			set_bit(R5_Wantread, &dev->flags);
3907			s->locked++;
3908			pr_debug("Reading block %d (sync=%d)\n",
3909				disk_idx, s->syncing);
3910		}
3911	}
3912
3913	return 0;
3914}
3915
3916/*
3917 * handle_stripe_fill - read or compute data to satisfy pending requests.
3918 */
3919static void handle_stripe_fill(struct stripe_head *sh,
3920			       struct stripe_head_state *s,
3921			       int disks)
3922{
3923	int i;
3924
3925	/* look for blocks to read/compute, skip this if a compute
3926	 * is already in flight, or if the stripe contents are in the
3927	 * midst of changing due to a write
3928	 */
3929	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3930	    !sh->reconstruct_state) {
3931
3932		/*
3933		 * For a degraded stripe with data in the journal, do not handle
3934		 * read requests yet; instead, flush the stripe to the raid
3935		 * disks first.  This avoids handling the complex rmw of the
3936		 * write back cache (prexor with orig_page, and then xor with
3937		 * page) in the read path
3938		 */
3939		if (s->to_read && s->injournal && s->failed) {
3940			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3941				r5c_make_stripe_write_out(sh);
3942			goto out;
3943		}
3944
3945		for (i = disks; i--; )
3946			if (fetch_block(sh, s, i, disks))
3947				break;
3948	}
3949out:
3950	set_bit(STRIPE_HANDLE, &sh->state);
3951}
3952
3953static void break_stripe_batch_list(struct stripe_head *head_sh,
3954				    unsigned long handle_flags);
3955/* handle_stripe_clean_event
3956 * any written block on an uptodate or failed drive can be returned.
3957 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3958 * never LOCKED, so we don't need to test 'failed' directly.
3959 */
3960static void handle_stripe_clean_event(struct r5conf *conf,
3961	struct stripe_head *sh, int disks)
3962{
3963	int i;
3964	struct r5dev *dev;
3965	int discard_pending = 0;
3966	struct stripe_head *head_sh = sh;
3967	bool do_endio = false;
3968
3969	for (i = disks; i--; )
3970		if (sh->dev[i].written) {
3971			dev = &sh->dev[i];
3972			if (!test_bit(R5_LOCKED, &dev->flags) &&
3973			    (test_bit(R5_UPTODATE, &dev->flags) ||
3974			     test_bit(R5_Discard, &dev->flags) ||
3975			     test_bit(R5_SkipCopy, &dev->flags))) {
3976				/* We can return any write requests */
3977				struct bio *wbi, *wbi2;
3978				pr_debug("Return write for disc %d\n", i);
3979				if (test_and_clear_bit(R5_Discard, &dev->flags))
3980					clear_bit(R5_UPTODATE, &dev->flags);
3981				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3982					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3983				}
3984				do_endio = true;
3985
3986returnbi:
3987				dev->page = dev->orig_page;
3988				wbi = dev->written;
3989				dev->written = NULL;
3990				while (wbi && wbi->bi_iter.bi_sector <
3991					dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3992					wbi2 = r5_next_bio(conf, wbi, dev->sector);
3993					md_write_end(conf->mddev);
3994					bio_endio(wbi);
3995					wbi = wbi2;
3996				}
3997				md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3998						   RAID5_STRIPE_SECTORS(conf),
3999						   !test_bit(STRIPE_DEGRADED, &sh->state),
4000						   0);
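				/* for a batch, return the writes on this
				 * disk for every member stripe as well
				 */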
4001				if (head_sh->batch_head) {
4002					sh = list_first_entry(&sh->batch_list,
4003							      struct stripe_head,
4004							      batch_list);
4005					if (sh != head_sh) {
4006						dev = &sh->dev[i];
4007						goto returnbi;
4008					}
4009				}
4010				sh = head_sh;
4011				dev = &sh->dev[i];
4012			} else if (test_bit(R5_Discard, &dev->flags))
4013				discard_pending = 1;
4014		}
4015
4016	log_stripe_write_finished(sh);
4017
4018	if (!discard_pending &&
4019	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4020		int hash;
4021		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4022		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4023		if (sh->qd_idx >= 0) {
4024			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4025			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4026		}
4027		/* now that discard is done we can proceed with any sync */
4028		clear_bit(STRIPE_DISCARD, &sh->state);
4029		/*
4030		 * SCSI discard will change some bio fields and the stripe has
4031		 * no updated data, so remove it from the hash list; the stripe
4032		 * will be reinitialized
4033		 */
4034unhash:
4035		hash = sh->hash_lock_index;
4036		spin_lock_irq(conf->hash_locks + hash);
4037		remove_hash(sh);
4038		spin_unlock_irq(conf->hash_locks + hash);
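		/* for a batch, unhash every member stripe as well */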
4039		if (head_sh->batch_head) {
4040			sh = list_first_entry(&sh->batch_list,
4041					      struct stripe_head, batch_list);
4042			if (sh != head_sh)
4043				goto unhash;
4044		}
4045		sh = head_sh;
4046
4047		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4048			set_bit(STRIPE_HANDLE, &sh->state);
4049
4050	}
4051
4052	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4053		if (atomic_dec_and_test(&conf->pending_full_writes))
4054			md_wakeup_thread(conf->mddev->thread);
4055
4056	if (head_sh->batch_head && do_endio)
4057		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4058}
4059
4060/*
4061 * For RMW in write back cache, we need an extra page in prexor to store the
4062 * old data. This page is stored in dev->orig_page.
4063 *
4064 * This function checks whether we have data for prexor. The exact logic
4065 * is:
4066 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
4067 */
4068static inline bool uptodate_for_rmw(struct r5dev *dev)
4069{
4070	return (test_bit(R5_UPTODATE, &dev->flags)) &&
4071		(!test_bit(R5_InJournal, &dev->flags) ||
4072		 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4073}
4074
4075static int handle_stripe_dirtying(struct r5conf *conf,
4076				  struct stripe_head *sh,
4077				  struct stripe_head_state *s,
4078				  int disks)
4079{
4080	int rmw = 0, rcw = 0, i;
4081	sector_t recovery_cp = conf->mddev->recovery_cp;
4082
4083	/* Check whether resync is now happening or should start.
4084	 * If yes, then the array is dirty (after unclean shutdown or
4085	 * initial creation), so parity in some stripes might be inconsistent.
4086	 * In this case, we need to always do reconstruct-write, to ensure
4087	 * that in case of drive failure or read-error correction, we
4088	 * generate correct data from the parity.
4089	 */
4090	if (conf->rmw_level == PARITY_DISABLE_RMW ||
4091	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4092	     s->failed == 0)) {
4093		/* Calculate the real rcw later - for now make it
4094		 * look like rcw is cheaper
4095		 */
4096		rcw = 1; rmw = 2;
4097		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4098			 conf->rmw_level, (unsigned long long)recovery_cp,
4099			 (unsigned long long)sh->sector);
4100	} else for (i = disks; i--; ) {
4101		/* would I have to read this buffer for read_modify_write */
4102		struct r5dev *dev = &sh->dev[i];
4103		if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4104		     i == sh->pd_idx || i == sh->qd_idx ||
4105		     test_bit(R5_InJournal, &dev->flags)) &&
4106		    !test_bit(R5_LOCKED, &dev->flags) &&
4107		    !(uptodate_for_rmw(dev) ||
4108		      test_bit(R5_Wantcompute, &dev->flags))) {
4109			if (test_bit(R5_Insync, &dev->flags))
4110				rmw++;
4111			else
4112				rmw += 2*disks;  /* cannot read it */
4113		}
4114		/* Would I have to read this buffer for reconstruct_write */
4115		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4116		    i != sh->pd_idx && i != sh->qd_idx &&
4117		    !test_bit(R5_LOCKED, &dev->flags) &&
4118		    !(test_bit(R5_UPTODATE, &dev->flags) ||
4119		      test_bit(R5_Wantcompute, &dev->flags))) {
4120			if (test_bit(R5_Insync, &dev->flags))
4121				rcw++;
4122			else
4123				rcw += 2*disks;
4124		}
4125	}
4126
4127	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4128		 (unsigned long long)sh->sector, sh->state, rmw, rcw);
4129	set_bit(STRIPE_HANDLE, &sh->state);
4130	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
4131		/* prefer read-modify-write, but need to get some data */
4132		if (conf->mddev->queue)
4133			blk_add_trace_msg(conf->mddev->queue,
4134					  "raid5 rmw %llu %d",
4135					  (unsigned long long)sh->sector, rmw);
4136		for (i = disks; i--; ) {
4137			struct r5dev *dev = &sh->dev[i];
4138			if (test_bit(R5_InJournal, &dev->flags) &&
4139			    dev->page == dev->orig_page &&
4140			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4141				/* alloc page for prexor */
4142				struct page *p = alloc_page(GFP_NOIO);
4143
4144				if (p) {
4145					dev->orig_page = p;
4146					continue;
4147				}
4148
4149				/*
4150				 * alloc_page() failed, try use
4151				 * disk_info->extra_page
4152				 */
4153				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4154						      &conf->cache_state)) {
4155					r5c_use_extra_page(sh);
4156					break;
4157				}
4158
4159				/* extra_page in use, add to delayed_list */
4160				set_bit(STRIPE_DELAYED, &sh->state);
4161				s->waiting_extra_page = 1;
4162				return -EAGAIN;
4163			}
4164		}
4165
4166		for (i = disks; i--; ) {
4167			struct r5dev *dev = &sh->dev[i];
4168			if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4169			     i == sh->pd_idx || i == sh->qd_idx ||
4170			     test_bit(R5_InJournal, &dev->flags)) &&
4171			    !test_bit(R5_LOCKED, &dev->flags) &&
4172			    !(uptodate_for_rmw(dev) ||
4173			      test_bit(R5_Wantcompute, &dev->flags)) &&
4174			    test_bit(R5_Insync, &dev->flags)) {
4175				if (test_bit(STRIPE_PREREAD_ACTIVE,
4176					     &sh->state)) {
4177					pr_debug("Read_old block %d for r-m-w\n",
4178						 i);
4179					set_bit(R5_LOCKED, &dev->flags);
4180					set_bit(R5_Wantread, &dev->flags);
4181					s->locked++;
4182				} else
4183					set_bit(STRIPE_DELAYED, &sh->state);
4184			}
4185		}
4186	}
4187	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
4188		/* want reconstruct write, but need to get some data */
4189		int qread =0;
4190		int qread = 0;
4191		for (i = disks; i--; ) {
4192			struct r5dev *dev = &sh->dev[i];
4193			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4194			    i != sh->pd_idx && i != sh->qd_idx &&
4195			    !test_bit(R5_LOCKED, &dev->flags) &&
4196			    !(test_bit(R5_UPTODATE, &dev->flags) ||
4197			      test_bit(R5_Wantcompute, &dev->flags))) {
4198				rcw++;
4199				if (test_bit(R5_Insync, &dev->flags) &&
4200				    test_bit(STRIPE_PREREAD_ACTIVE,
4201					     &sh->state)) {
4202					pr_debug("Read_old block "
4203						"%d for Reconstruct\n", i);
4204					set_bit(R5_LOCKED, &dev->flags);
4205					set_bit(R5_Wantread, &dev->flags);
4206					s->locked++;
4207					qread++;
4208				} else
4209					set_bit(STRIPE_DELAYED, &sh->state);
4210			}
4211		}
4212		if (rcw && conf->mddev->queue)
4213			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4214					  (unsigned long long)sh->sector,
4215					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4216	}
4217
4218	if (rcw > disks && rmw > disks &&
4219	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4220		set_bit(STRIPE_DELAYED, &sh->state);
4221
4222	/* now if nothing is locked, and if we have enough data,
4223	 * we can start a write request
4224	 */
4225	/* since handle_stripe can be called at any time we need to handle the
4226	 * case where a compute block operation has been submitted and then a
4227	 * subsequent call wants to start a write request.  raid_run_ops only
4228	 * handles the case where compute block and reconstruct are requested
4229	 * simultaneously.  If this is not the case then new writes need to be
4230	 * held off until the compute completes.
4231	 */
4232	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4233	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4234	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4235		schedule_reconstruction(sh, s, rcw == 0, 0);
4236	return 0;
4237}
4238
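/*
 * handle_parity_checks5 - drive the raid5 parity-check state machine:
 * schedule a check of the parity block and, on a mismatch, update
 * resync_mismatches and either recompute the parity or, when
 * MD_RECOVERY_CHECK is set, leave it untouched; finally mark the stripe
 * in-sync.
 */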
4239static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4240				struct stripe_head_state *s, int disks)
4241{
4242	struct r5dev *dev = NULL;
4243
4244	BUG_ON(sh->batch_head);
4245	set_bit(STRIPE_HANDLE, &sh->state);
4246
4247	switch (sh->check_state) {
4248	case check_state_idle:
4249		/* start a new check operation if there are no failures */
4250		if (s->failed == 0) {
4251			BUG_ON(s->uptodate != disks);
4252			sh->check_state = check_state_run;
4253			set_bit(STRIPE_OP_CHECK, &s->ops_request);
4254			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4255			s->uptodate--;
4256			break;
4257		}
4258		dev = &sh->dev[s->failed_num[0]];
4259		fallthrough;
4260	case check_state_compute_result:
4261		sh->check_state = check_state_idle;
4262		if (!dev)
4263			dev = &sh->dev[sh->pd_idx];
4264
4265		/* check that a write has not made the stripe insync */
4266		if (test_bit(STRIPE_INSYNC, &sh->state))
4267			break;
4268
4269		/* either failed parity check, or recovery is happening */
4270		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4271		BUG_ON(s->uptodate != disks);
4272
4273		set_bit(R5_LOCKED, &dev->flags);
4274		s->locked++;
4275		set_bit(R5_Wantwrite, &dev->flags);
4276
4277		clear_bit(STRIPE_DEGRADED, &sh->state);
4278		set_bit(STRIPE_INSYNC, &sh->state);
4279		break;
4280	case check_state_run:
4281		break; /* we will be called again upon completion */
4282	case check_state_check_result:
4283		sh->check_state = check_state_idle;
4284
4285		/* if a failure occurred during the check operation, leave
4286		 * STRIPE_INSYNC not set and let the stripe be handled again
4287		 */
4288		if (s->failed)
4289			break;
4290
4291		/* handle a successful check operation, if parity is correct
4292		 * we are done.  Otherwise update the mismatch count and repair
4293		 * parity if !MD_RECOVERY_CHECK
4294		 */
4295		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4296			/* parity is correct (on disc,
4297			 * not in buffer any more)
4298			 */
4299			set_bit(STRIPE_INSYNC, &sh->state);
4300		else {
4301			atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4302			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4303				/* don't try to repair!! */
4304				set_bit(STRIPE_INSYNC, &sh->state);
4305				pr_warn_ratelimited("%s: mismatch sector in range "
4306						    "%llu-%llu\n", mdname(conf->mddev),
4307						    (unsigned long long) sh->sector,
4308						    (unsigned long long) sh->sector +
4309						    RAID5_STRIPE_SECTORS(conf));
4310			} else {
4311				sh->check_state = check_state_compute_run;
4312				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4313				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4314				set_bit(R5_Wantcompute,
4315					&sh->dev[sh->pd_idx].flags);
4316				sh->ops.target = sh->pd_idx;
4317				sh->ops.target2 = -1;
4318				s->uptodate++;
4319			}
4320		}
4321		break;
4322	case check_state_compute_run:
4323		break;
4324	default:
4325		pr_err("%s: unknown check_state: %d sector: %llu\n",
4326		       __func__, sh->check_state,
4327		       (unsigned long long) sh->sector);
4328		BUG();
4329	}
4330}
4331
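/*
 * handle_parity_checks6 - as handle_parity_checks5, but for raid6: check
 * and possibly repair both P and Q while coping with up to two failed
 * devices.
 */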
4332static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4333				  struct stripe_head_state *s,
4334				  int disks)
4335{
4336	int pd_idx = sh->pd_idx;
4337	int qd_idx = sh->qd_idx;
4338	struct r5dev *dev;
4339
4340	BUG_ON(sh->batch_head);
4341	set_bit(STRIPE_HANDLE, &sh->state);
4342
4343	BUG_ON(s->failed > 2);
4344
4345	/* Want to check and possibly repair P and Q.
4346	 * However there could be one 'failed' device, in which
4347	 * case we can only check one of them, possibly using the
4348	 * other to generate missing data
4349	 */
4350
4351	switch (sh->check_state) {
4352	case check_state_idle:
4353		/* start a new check operation if there are < 2 failures */
4354		if (s->failed == s->q_failed) {
4355			/* The only possible failed device holds Q, so it
4356			 * makes sense to check P (If anything else were failed,
4357			 * we would have used P to recreate it).
4358			 */
4359			sh->check_state = check_state_run;
4360		}
4361		if (!s->q_failed && s->failed < 2) {
4362			/* Q is not failed, and we didn't use it to generate
4363			 * anything, so it makes sense to check it
4364			 */
4365			if (sh->check_state == check_state_run)
4366				sh->check_state = check_state_run_pq;
4367			else
4368				sh->check_state = check_state_run_q;
4369		}
4370
4371		/* discard potentially stale zero_sum_result */
4372		sh->ops.zero_sum_result = 0;
4373
4374		if (sh->check_state == check_state_run) {
4375			/* async_xor_zero_sum destroys the contents of P */
4376			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4377			s->uptodate--;
4378		}
4379		if (sh->check_state >= check_state_run &&
4380		    sh->check_state <= check_state_run_pq) {
4381			/* async_syndrome_zero_sum preserves P and Q, so
4382			 * no need to mark them !uptodate here
4383			 */
4384			set_bit(STRIPE_OP_CHECK, &s->ops_request);
4385			break;
4386		}
4387
4388		/* we have 2-disk failure */
4389		BUG_ON(s->failed != 2);
4390		fallthrough;
4391	case check_state_compute_result:
4392		sh->check_state = check_state_idle;
4393
4394		/* check that a write has not made the stripe insync */
4395		if (test_bit(STRIPE_INSYNC, &sh->state))
4396			break;
4397
4398		/* now write out any block on a failed drive,
4399		 * or P or Q if they were recomputed
4400		 */
4401		dev = NULL;
4402		if (s->failed == 2) {
4403			dev = &sh->dev[s->failed_num[1]];
4404			s->locked++;
4405			set_bit(R5_LOCKED, &dev->flags);
4406			set_bit(R5_Wantwrite, &dev->flags);
4407		}
4408		if (s->failed >= 1) {
4409			dev = &sh->dev[s->failed_num[0]];
4410			s->locked++;
4411			set_bit(R5_LOCKED, &dev->flags);
4412			set_bit(R5_Wantwrite, &dev->flags);
4413		}
4414		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4415			dev = &sh->dev[pd_idx];
4416			s->locked++;
4417			set_bit(R5_LOCKED, &dev->flags);
4418			set_bit(R5_Wantwrite, &dev->flags);
4419		}
4420		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4421			dev = &sh->dev[qd_idx];
4422			s->locked++;
4423			set_bit(R5_LOCKED, &dev->flags);
4424			set_bit(R5_Wantwrite, &dev->flags);
4425		}
4426		if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4427			      "%s: disk%td not up to date\n",
4428			      mdname(conf->mddev),
4429			      dev - (struct r5dev *) &sh->dev)) {
4430			clear_bit(R5_LOCKED, &dev->flags);
4431			clear_bit(R5_Wantwrite, &dev->flags);
4432			s->locked--;
4433		}
4434		clear_bit(STRIPE_DEGRADED, &sh->state);
4435
4436		set_bit(STRIPE_INSYNC, &sh->state);
4437		break;
4438	case check_state_run:
4439	case check_state_run_q:
4440	case check_state_run_pq:
4441		break; /* we will be called again upon completion */
4442	case check_state_check_result:
4443		sh->check_state = check_state_idle;
4444
4445		/* handle a successful check operation, if parity is correct
4446		 * we are done.  Otherwise update the mismatch count and repair
4447		 * parity if !MD_RECOVERY_CHECK
4448		 */
4449		if (sh->ops.zero_sum_result == 0) {
4450			/* both parities are correct */
4451			if (!s->failed)
4452				set_bit(STRIPE_INSYNC, &sh->state);
4453			else {
4454				/* in contrast to the raid5 case we can validate
4455				 * parity, but still have a failure to write
4456				 * back
4457				 */
4458				sh->check_state = check_state_compute_result;
4459				/* Returning at this point means that we may go
4460			 * off and bring p and/or q uptodate again, so
4461			 * we make sure to check zero_sum_result again
4462			 * to verify whether p or q need writeback
4463				 */
4464			}
4465		} else {
4466			atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4467			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4468				/* don't try to repair!! */
4469				set_bit(STRIPE_INSYNC, &sh->state);
4470				pr_warn_ratelimited("%s: mismatch sector in range "
4471						    "%llu-%llu\n", mdname(conf->mddev),
4472						    (unsigned long long) sh->sector,
4473						    (unsigned long long) sh->sector +
4474						    RAID5_STRIPE_SECTORS(conf));
4475			} else {
4476				int *target = &sh->ops.target;
4477
4478				sh->ops.target = -1;
4479				sh->ops.target2 = -1;
4480				sh->check_state = check_state_compute_run;
4481				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4482				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4483				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4484					set_bit(R5_Wantcompute,
4485						&sh->dev[pd_idx].flags);
4486					*target = pd_idx;
4487					target = &sh->ops.target2;
4488					s->uptodate++;
4489				}
4490				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4491					set_bit(R5_Wantcompute,
4492						&sh->dev[qd_idx].flags);
4493					*target = qd_idx;
4494					s->uptodate++;
4495				}
4496			}
4497		}
4498		break;
4499	case check_state_compute_run:
4500		break;
4501	default:
4502		pr_warn("%s: unknown check_state: %d sector: %llu\n",
4503			__func__, sh->check_state,
4504			(unsigned long long) sh->sector);
4505		BUG();
4506	}
4507}
4508
4509static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4510{
4511	int i;
4512
4513	/* We have read all the blocks in this stripe and now we need to
4514	 * copy some of them into a target stripe for expand.
4515	 */
4516	struct dma_async_tx_descriptor *tx = NULL;
4517	BUG_ON(sh->batch_head);
4518	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4519	for (i = 0; i < sh->disks; i++)
4520		if (i != sh->pd_idx && i != sh->qd_idx) {
4521			int dd_idx, j;
4522			struct stripe_head *sh2;
4523			struct async_submit_ctl submit;
4524
4525			sector_t bn = raid5_compute_blocknr(sh, i, 1);
4526			sector_t s = raid5_compute_sector(conf, bn, 0,
4527							  &dd_idx, NULL);
4528			sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4529			if (sh2 == NULL)
4530				/* so far only the early blocks of this stripe
4531				 * have been requested.  When later blocks
4532				 * get requested, we will try again
4533				 */
4534				continue;
4535			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4536			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4537				/* must have already done this block */
4538				raid5_release_stripe(sh2);
4539				continue;
4540			}
4541
4542			/* place all the copies on one channel */
4543			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4544			tx = async_memcpy(sh2->dev[dd_idx].page,
4545					  sh->dev[i].page, sh2->dev[dd_idx].offset,
4546					  sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4547					  &submit);
4548
4549			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4550			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4551			for (j = 0; j < conf->raid_disks; j++)
4552				if (j != sh2->pd_idx &&
4553				    j != sh2->qd_idx &&
4554				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
4555					break;
4556			if (j == conf->raid_disks) {
4557				set_bit(STRIPE_EXPAND_READY, &sh2->state);
4558				set_bit(STRIPE_HANDLE, &sh2->state);
4559			}
4560			raid5_release_stripe(sh2);
4561
4562		}
4563	/* done submitting copies, wait for them to complete */
4564	async_tx_quiesce(&tx);
4565}
4566
4567/*
4568 * handle_stripe - do things to a stripe.
4569 *
4570 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4571 * state of various bits to see what needs to be done.
4572 * Possible results:
4573 *    return some read requests which now have data
4574 *    return some write requests which are safely on storage
4575 *    schedule a read on some buffers
4576 *    schedule a write of some buffers
4577 *    return confirmation of parity correctness
4578 *
4579 */
4580
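/*
 * analyse_stripe - examine every device of the stripe and summarise the
 * result in *s: locked/uptodate/failed counts, pending reads and writes,
 * bad-block and replacement state, and whether we are syncing or
 * replacing.
 */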
4581static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4582{
4583	struct r5conf *conf = sh->raid_conf;
4584	int disks = sh->disks;
4585	struct r5dev *dev;
4586	int i;
4587	int do_recovery = 0;
4588
4589	memset(s, 0, sizeof(*s));
4590
4591	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4592	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4593	s->failed_num[0] = -1;
4594	s->failed_num[1] = -1;
4595	s->log_failed = r5l_log_disk_error(conf);
4596
4597	/* Now to look around and see what can be done */
4598	rcu_read_lock();
4599	for (i=disks; i--; ) {
4600		struct md_rdev *rdev;
4601		sector_t first_bad;
4602		int bad_sectors;
4603		int is_bad = 0;
4604
4605		dev = &sh->dev[i];
4606
4607		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4608			 i, dev->flags,
4609			 dev->toread, dev->towrite, dev->written);
4610		/* maybe we can reply to a read
4611		 *
4612		 * new wantfill requests are only permitted while
4613		 * ops_complete_biofill is guaranteed to be inactive
4614		 */
4615		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4616		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4617			set_bit(R5_Wantfill, &dev->flags);
4618
4619		/* now count some things */
4620		if (test_bit(R5_LOCKED, &dev->flags))
4621			s->locked++;
4622		if (test_bit(R5_UPTODATE, &dev->flags))
4623			s->uptodate++;
4624		if (test_bit(R5_Wantcompute, &dev->flags)) {
4625			s->compute++;
4626			BUG_ON(s->compute > 2);
4627		}
4628
4629		if (test_bit(R5_Wantfill, &dev->flags))
4630			s->to_fill++;
4631		else if (dev->toread)
4632			s->to_read++;
4633		if (dev->towrite) {
4634			s->to_write++;
4635			if (!test_bit(R5_OVERWRITE, &dev->flags))
4636				s->non_overwrite++;
4637		}
4638		if (dev->written)
4639			s->written++;
4640		/* Prefer to use the replacement for reads, but only
4641		 * if it is recovered enough and has no bad blocks.
4642		 */
4643		rdev = rcu_dereference(conf->disks[i].replacement);
4644		if (rdev && !test_bit(Faulty, &rdev->flags) &&
4645		    rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4646		    !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4647				 &first_bad, &bad_sectors))
4648			set_bit(R5_ReadRepl, &dev->flags);
4649		else {
4650			if (rdev && !test_bit(Faulty, &rdev->flags))
4651				set_bit(R5_NeedReplace, &dev->flags);
4652			else
4653				clear_bit(R5_NeedReplace, &dev->flags);
4654			rdev = rcu_dereference(conf->disks[i].rdev);
4655			clear_bit(R5_ReadRepl, &dev->flags);
4656		}
4657		if (rdev && test_bit(Faulty, &rdev->flags))
4658			rdev = NULL;
4659		if (rdev) {
4660			is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4661					     &first_bad, &bad_sectors);
4662			if (s->blocked_rdev == NULL
4663			    && (test_bit(Blocked, &rdev->flags)
4664				|| is_bad < 0)) {
4665				if (is_bad < 0)
4666					set_bit(BlockedBadBlocks,
4667						&rdev->flags);
4668				s->blocked_rdev = rdev;
4669				atomic_inc(&rdev->nr_pending);
4670			}
4671		}
4672		clear_bit(R5_Insync, &dev->flags);
4673		if (!rdev)
4674			/* Not in-sync */;
4675		else if (is_bad) {
4676			/* also not in-sync */
4677			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4678			    test_bit(R5_UPTODATE, &dev->flags)) {
4679				/* treat as in-sync, but with a read error
4680				 * which we can now try to correct
4681				 */
4682				set_bit(R5_Insync, &dev->flags);
4683				set_bit(R5_ReadError, &dev->flags);
4684			}
4685		} else if (test_bit(In_sync, &rdev->flags))
4686			set_bit(R5_Insync, &dev->flags);
4687		else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4688			/* in sync if before recovery_offset */
4689			set_bit(R5_Insync, &dev->flags);
4690		else if (test_bit(R5_UPTODATE, &dev->flags) &&
4691			 test_bit(R5_Expanded, &dev->flags))
4692			/* If we've reshaped into here, we assume it is Insync.
4693			 * We will shortly update recovery_offset to make
4694			 * it official.
4695			 */
4696			set_bit(R5_Insync, &dev->flags);
4697
4698		if (test_bit(R5_WriteError, &dev->flags)) {
4699			/* This flag does not apply to '.replacement',
4700			 * only to .rdev, so make sure to check that */
4701			struct md_rdev *rdev2 = rcu_dereference(
4702				conf->disks[i].rdev);
4703			if (rdev2 == rdev)
4704				clear_bit(R5_Insync, &dev->flags);
4705			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4706				s->handle_bad_blocks = 1;
4707				atomic_inc(&rdev2->nr_pending);
4708			} else
4709				clear_bit(R5_WriteError, &dev->flags);
4710		}
4711		if (test_bit(R5_MadeGood, &dev->flags)) {
4712			/* This flag does not apply to '.replacement',
4713			 * only to .rdev, so make sure to check that */
4714			struct md_rdev *rdev2 = rcu_dereference(
4715				conf->disks[i].rdev);
4716			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4717				s->handle_bad_blocks = 1;
4718				atomic_inc(&rdev2->nr_pending);
4719			} else
4720				clear_bit(R5_MadeGood, &dev->flags);
4721		}
4722		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4723			struct md_rdev *rdev2 = rcu_dereference(
4724				conf->disks[i].replacement);
4725			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4726				s->handle_bad_blocks = 1;
4727				atomic_inc(&rdev2->nr_pending);
4728			} else
4729				clear_bit(R5_MadeGoodRepl, &dev->flags);
4730		}
4731		if (!test_bit(R5_Insync, &dev->flags)) {
4732			/* The ReadError flag will just be confusing now */
4733			clear_bit(R5_ReadError, &dev->flags);
4734			clear_bit(R5_ReWrite, &dev->flags);
4735		}
4736		if (test_bit(R5_ReadError, &dev->flags))
4737			clear_bit(R5_Insync, &dev->flags);
4738		if (!test_bit(R5_Insync, &dev->flags)) {
4739			if (s->failed < 2)
4740				s->failed_num[s->failed] = i;
4741			s->failed++;
4742			if (rdev && !test_bit(Faulty, &rdev->flags))
4743				do_recovery = 1;
4744			else if (!rdev) {
4745				rdev = rcu_dereference(
4746				    conf->disks[i].replacement);
4747				if (rdev && !test_bit(Faulty, &rdev->flags))
4748					do_recovery = 1;
4749			}
4750		}
4751
4752		if (test_bit(R5_InJournal, &dev->flags))
4753			s->injournal++;
4754		if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4755			s->just_cached++;
4756	}
4757	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4758		/* If there is a failed device being replaced,
4759		 *     we must be recovering.
4760		 * else if we are after recovery_cp, we must be syncing
4761		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4762		 * else we can only be replacing
4763		 * sync and recovery both need to read all devices, and so
4764		 * use the same flag.
4765		 */
4766		if (do_recovery ||
4767		    sh->sector >= conf->mddev->recovery_cp ||
4768		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4769			s->syncing = 1;
4770		else
4771			s->replacing = 1;
4772	}
4773	rcu_read_unlock();
4774}
4775
4776/*
4777 * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4778 * a head which can now be handled.
4779 */
4780static int clear_batch_ready(struct stripe_head *sh)
4781{
4782	struct stripe_head *tmp;
4783	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4784		return (sh->batch_head && sh->batch_head != sh);
4785	spin_lock(&sh->stripe_lock);
4786	if (!sh->batch_head) {
4787		spin_unlock(&sh->stripe_lock);
4788		return 0;
4789	}
4790
4791	/*
4792	 * this stripe could be added to a batch list before we check
4793	 * BATCH_READY, so skip it
4794	 */
4795	if (sh->batch_head != sh) {
4796		spin_unlock(&sh->stripe_lock);
4797		return 1;
4798	}
4799	spin_lock(&sh->batch_lock);
4800	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4801		clear_bit(STRIPE_BATCH_READY, &tmp->state);
4802	spin_unlock(&sh->batch_lock);
4803	spin_unlock(&sh->stripe_lock);
4804
4805	/*
4806	 * BATCH_READY is cleared, no new stripes can be added.
4807	 * batch_list can be accessed without lock
4808	 */
4809	return 0;
4810}
4811
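/*
 * break_stripe_batch_list - detach every stripe from head_sh's batch,
 * propagate the relevant state and per-device flags from the head, and
 * re-queue each stripe for handling if its state matches handle_flags
 * (handle_flags == 0 means re-queue unconditionally).
 */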
4812static void break_stripe_batch_list(struct stripe_head *head_sh,
4813				    unsigned long handle_flags)
4814{
4815	struct stripe_head *sh, *next;
4816	int i;
4817	int do_wakeup = 0;
4818
4819	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4820
4821		list_del_init(&sh->batch_list);
4822
4823		WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4824					  (1 << STRIPE_SYNCING) |
4825					  (1 << STRIPE_REPLACED) |
4826					  (1 << STRIPE_DELAYED) |
4827					  (1 << STRIPE_BIT_DELAY) |
4828					  (1 << STRIPE_FULL_WRITE) |
4829					  (1 << STRIPE_BIOFILL_RUN) |
4830					  (1 << STRIPE_COMPUTE_RUN)  |
4831					  (1 << STRIPE_DISCARD) |
4832					  (1 << STRIPE_BATCH_READY) |
4833					  (1 << STRIPE_BATCH_ERR) |
4834					  (1 << STRIPE_BITMAP_PENDING)),
4835			"stripe state: %lx\n", sh->state);
4836		WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4837					      (1 << STRIPE_REPLACED)),
4838			"head stripe state: %lx\n", head_sh->state);
4839
4840		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4841					    (1 << STRIPE_PREREAD_ACTIVE) |
4842					    (1 << STRIPE_DEGRADED) |
4843					    (1 << STRIPE_ON_UNPLUG_LIST)),
4844			      head_sh->state & (1 << STRIPE_INSYNC));
4845
4846		sh->check_state = head_sh->check_state;
4847		sh->reconstruct_state = head_sh->reconstruct_state;
4848		spin_lock_irq(&sh->stripe_lock);
4849		sh->batch_head = NULL;
4850		spin_unlock_irq(&sh->stripe_lock);
4851		for (i = 0; i < sh->disks; i++) {
4852			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4853				do_wakeup = 1;
4854			sh->dev[i].flags = head_sh->dev[i].flags &
4855				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
4856		}
4857		if (handle_flags == 0 ||
4858		    sh->state & handle_flags)
4859			set_bit(STRIPE_HANDLE, &sh->state);
4860		raid5_release_stripe(sh);
4861	}
4862	spin_lock_irq(&head_sh->stripe_lock);
4863	head_sh->batch_head = NULL;
4864	spin_unlock_irq(&head_sh->stripe_lock);
4865	for (i = 0; i < head_sh->disks; i++)
4866		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4867			do_wakeup = 1;
4868	if (head_sh->state & handle_flags)
4869		set_bit(STRIPE_HANDLE, &head_sh->state);
4870
4871	if (do_wakeup)
4872		wake_up(&head_sh->raid_conf->wait_for_overlap);
4873}
4874
4875static void handle_stripe(struct stripe_head *sh)
4876{
4877	struct stripe_head_state s;
4878	struct r5conf *conf = sh->raid_conf;
4879	int i;
4880	int prexor;
4881	int disks = sh->disks;
4882	struct r5dev *pdev, *qdev;
4883
4884	clear_bit(STRIPE_HANDLE, &sh->state);
4885
4886	/*
4887	 * handle_stripe should not continue handling a batched stripe; only
4888	 * the head of a batch list or a lone stripe can continue. Otherwise
4889	 * break_stripe_batch_list could warn that STRIPE_ACTIVE is set for
4890	 * the batched stripe.
4891	 */
4892	if (clear_batch_ready(sh))
4893		return;
4894
4895	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4896		/* already being handled, ensure it gets handled
4897		 * again when current action finishes */
4898		set_bit(STRIPE_HANDLE, &sh->state);
4899		return;
4900	}
4901
4902	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4903		break_stripe_batch_list(sh, 0);
4904
4905	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4906		spin_lock(&sh->stripe_lock);
4907		/*
4908		 * Cannot process 'sync' concurrently with 'discard'.
4909		 * Flush data in r5cache before 'sync'.
4910		 */
4911		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4912		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4913		    !test_bit(STRIPE_DISCARD, &sh->state) &&
4914		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4915			set_bit(STRIPE_SYNCING, &sh->state);
4916			clear_bit(STRIPE_INSYNC, &sh->state);
4917			clear_bit(STRIPE_REPLACED, &sh->state);
4918		}
4919		spin_unlock(&sh->stripe_lock);
4920	}
4921	clear_bit(STRIPE_DELAYED, &sh->state);
4922
4923	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4924		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4925	       (unsigned long long)sh->sector, sh->state,
4926	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4927	       sh->check_state, sh->reconstruct_state);
4928
4929	analyse_stripe(sh, &s);
4930
4931	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4932		goto finish;
4933
4934	if (s.handle_bad_blocks ||
4935	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4936		set_bit(STRIPE_HANDLE, &sh->state);
4937		goto finish;
4938	}
4939
4940	if (unlikely(s.blocked_rdev)) {
4941		if (s.syncing || s.expanding || s.expanded ||
4942		    s.replacing || s.to_write || s.written) {
4943			set_bit(STRIPE_HANDLE, &sh->state);
4944			goto finish;
4945		}
4946		/* There is nothing for the blocked_rdev to block */
4947		rdev_dec_pending(s.blocked_rdev, conf->mddev);
4948		s.blocked_rdev = NULL;
4949	}
4950
4951	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4952		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4953		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4954	}
4955
4956	pr_debug("locked=%d uptodate=%d to_read=%d"
4957	       " to_write=%d failed=%d failed_num=%d,%d\n",
4958	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4959	       s.failed_num[0], s.failed_num[1]);
4960	/*
4961	 * check if the array has lost more than max_degraded devices and,
4962	 * if so, some requests might need to be failed.
4963	 *
4964	 * When the journal device has failed (log_failed), we will only process
4965	 * the stripe if there is data that needs to be written to the raid disks
4966	 */
4967	if (s.failed > conf->max_degraded ||
4968	    (s.log_failed && s.injournal == 0)) {
4969		sh->check_state = 0;
4970		sh->reconstruct_state = 0;
4971		break_stripe_batch_list(sh, 0);
4972		if (s.to_read+s.to_write+s.written)
4973			handle_failed_stripe(conf, sh, &s, disks);
4974		if (s.syncing + s.replacing)
4975			handle_failed_sync(conf, sh, &s);
4976	}
4977
4978	/* Now we check to see if any write operations have recently
4979	 * completed
4980	 */
4981	prexor = 0;
4982	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4983		prexor = 1;
4984	if (sh->reconstruct_state == reconstruct_state_drain_result ||
4985	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4986		sh->reconstruct_state = reconstruct_state_idle;
4987
4988		/* All the 'written' buffers and the parity block are ready to
4989		 * be written back to disk
4990		 */
4991		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4992		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4993		BUG_ON(sh->qd_idx >= 0 &&
4994		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4995		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4996		for (i = disks; i--; ) {
4997			struct r5dev *dev = &sh->dev[i];
4998			if (test_bit(R5_LOCKED, &dev->flags) &&
4999				(i == sh->pd_idx || i == sh->qd_idx ||
5000				 dev->written || test_bit(R5_InJournal,
5001							  &dev->flags))) {
5002				pr_debug("Writing block %d\n", i);
5003				set_bit(R5_Wantwrite, &dev->flags);
5004				if (prexor)
5005					continue;
5006				if (s.failed > 1)
5007					continue;
5008				if (!test_bit(R5_Insync, &dev->flags) ||
5009				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
5010				     s.failed == 0))
5011					set_bit(STRIPE_INSYNC, &sh->state);
5012			}
5013		}
5014		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5015			s.dec_preread_active = 1;
5016	}
5017
5018	/*
5019	 * might be able to return some write requests if the parity blocks
5020	 * are safe, or on a failed drive
5021	 */
5022	pdev = &sh->dev[sh->pd_idx];
5023	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5024		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5025	qdev = &sh->dev[sh->qd_idx];
5026	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5027		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5028		|| conf->level < 6;
5029
5030	if (s.written &&
5031	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5032			     && !test_bit(R5_LOCKED, &pdev->flags)
5033			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
5034				 test_bit(R5_Discard, &pdev->flags))))) &&
5035	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5036			     && !test_bit(R5_LOCKED, &qdev->flags)
5037			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
5038				 test_bit(R5_Discard, &qdev->flags))))))
5039		handle_stripe_clean_event(conf, sh, disks);
5040
5041	if (s.just_cached)
5042		r5c_handle_cached_data_endio(conf, sh, disks);
5043	log_stripe_write_finished(sh);
5044
5045	/* Now we might consider reading some blocks, either to check/generate
5046	 * parity, or to satisfy requests
5047	 * or to load a block that is being partially written.
5048	 */
5049	if (s.to_read || s.non_overwrite
5050	    || (s.to_write && s.failed)
5051	    || (s.syncing && (s.uptodate + s.compute < disks))
5052	    || s.replacing
5053	    || s.expanding)
5054		handle_stripe_fill(sh, &s, disks);
5055
5056	/*
5057	 * When the stripe finishes full journal write cycle (write to journal
5058	 * and raid disk), this is the clean up procedure so it is ready for
5059	 * next operation.
5060	 */
5061	r5c_finish_stripe_write_out(conf, sh, &s);
5062
5063	/*
5064	 * Now to consider new write requests, cache write back and what else,
5065	 * if anything should be read.  We do not handle new writes when:
5066	 * 1/ A 'write' operation (copy+xor) is already in flight.
5067	 * 2/ A 'check' operation is in flight, as it may clobber the parity
5068	 *    block.
5069	 * 3/ A r5c cache log write is in flight.
5070	 */
5071
5072	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5073		if (!r5c_is_writeback(conf->log)) {
5074			if (s.to_write)
5075				handle_stripe_dirtying(conf, sh, &s, disks);
5076		} else { /* write back cache */
5077			int ret = 0;
5078
5079			/* First, try handle writes in caching phase */
5080			if (s.to_write)
5081				ret = r5c_try_caching_write(conf, sh, &s,
5082							    disks);
5083			/*
5084			 * If caching phase failed: ret == -EAGAIN
5085			 *    OR
5086			 * stripe under reclaim: !caching && injournal
5087			 *
5088			 * fall back to handle_stripe_dirtying()
5089			 */
5090			if (ret == -EAGAIN ||
5091			    /* stripe under reclaim: !caching && injournal */
5092			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5093			     s.injournal > 0)) {
5094				ret = handle_stripe_dirtying(conf, sh, &s,
5095							     disks);
5096				if (ret == -EAGAIN)
5097					goto finish;
5098			}
5099		}
5100	}
5101
5102	/* maybe we need to check and possibly fix the parity for this stripe
5103	 * Any reads will already have been scheduled, so we just see if enough
5104	 * data is available.  The parity check is held off while parity
5105	 * dependent operations are in flight.
5106	 */
5107	if (sh->check_state ||
5108	    (s.syncing && s.locked == 0 &&
5109	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5110	     !test_bit(STRIPE_INSYNC, &sh->state))) {
5111		if (conf->level == 6)
5112			handle_parity_checks6(conf, sh, &s, disks);
5113		else
5114			handle_parity_checks5(conf, sh, &s, disks);
5115	}
5116
5117	if ((s.replacing || s.syncing) && s.locked == 0
5118	    && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5119	    && !test_bit(STRIPE_REPLACED, &sh->state)) {
5120		/* Write out to replacement devices where possible */
5121		for (i = 0; i < conf->raid_disks; i++)
5122			if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5123				WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5124				set_bit(R5_WantReplace, &sh->dev[i].flags);
5125				set_bit(R5_LOCKED, &sh->dev[i].flags);
5126				s.locked++;
5127			}
5128		if (s.replacing)
5129			set_bit(STRIPE_INSYNC, &sh->state);
5130		set_bit(STRIPE_REPLACED, &sh->state);
5131	}
5132	if ((s.syncing || s.replacing) && s.locked == 0 &&
5133	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5134	    test_bit(STRIPE_INSYNC, &sh->state)) {
5135		md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5136		clear_bit(STRIPE_SYNCING, &sh->state);
5137		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5138			wake_up(&conf->wait_for_overlap);
5139	}
5140
5141	/* If the failed drives are just a ReadError, then we might need
5142	 * to progress the repair/check process
5143	 */
5144	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5145		for (i = 0; i < s.failed; i++) {
5146			struct r5dev *dev = &sh->dev[s.failed_num[i]];
5147			if (test_bit(R5_ReadError, &dev->flags)
5148			    && !test_bit(R5_LOCKED, &dev->flags)
5149			    && test_bit(R5_UPTODATE, &dev->flags)
5150				) {
5151				if (!test_bit(R5_ReWrite, &dev->flags)) {
5152					set_bit(R5_Wantwrite, &dev->flags);
5153					set_bit(R5_ReWrite, &dev->flags);
5154				} else
5155					/* let's read it back */
5156					set_bit(R5_Wantread, &dev->flags);
5157				set_bit(R5_LOCKED, &dev->flags);
5158				s.locked++;
5159			}
5160		}
5161
5162	/* Finish reconstruct operations initiated by the expansion process */
5163	if (sh->reconstruct_state == reconstruct_state_result) {
5164		struct stripe_head *sh_src
5165			= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
5166		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5167			/* sh cannot be written until sh_src has been read,
5168			 * so arrange for sh to be delayed a little
5169			 */
5170			set_bit(STRIPE_DELAYED, &sh->state);
5171			set_bit(STRIPE_HANDLE, &sh->state);
5172			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5173					      &sh_src->state))
5174				atomic_inc(&conf->preread_active_stripes);
5175			raid5_release_stripe(sh_src);
5176			goto finish;
5177		}
5178		if (sh_src)
5179			raid5_release_stripe(sh_src);
5180
5181		sh->reconstruct_state = reconstruct_state_idle;
5182		clear_bit(STRIPE_EXPANDING, &sh->state);
5183		for (i = conf->raid_disks; i--; ) {
5184			set_bit(R5_Wantwrite, &sh->dev[i].flags);
5185			set_bit(R5_LOCKED, &sh->dev[i].flags);
5186			s.locked++;
5187		}
5188	}
5189
5190	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5191	    !sh->reconstruct_state) {
5192		/* Need to write out all blocks after computing parity */
5193		sh->disks = conf->raid_disks;
5194		stripe_set_idx(sh->sector, conf, 0, sh);
5195		schedule_reconstruction(sh, &s, 1, 1);
5196	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5197		clear_bit(STRIPE_EXPAND_READY, &sh->state);
5198		atomic_dec(&conf->reshape_stripes);
5199		wake_up(&conf->wait_for_overlap);
5200		md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5201	}
5202
5203	if (s.expanding && s.locked == 0 &&
5204	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5205		handle_stripe_expansion(conf, sh);
5206
5207finish:
5208	/* wait for this device to become unblocked */
5209	if (unlikely(s.blocked_rdev)) {
5210		if (conf->mddev->external)
5211			md_wait_for_blocked_rdev(s.blocked_rdev,
5212						 conf->mddev);
5213		else
5214			/* Internal metadata will immediately
5215			 * be written by raid5d, so we don't
5216			 * need to wait here.
5217			 */
5218			rdev_dec_pending(s.blocked_rdev,
5219					 conf->mddev);
5220	}
5221
5222	if (s.handle_bad_blocks)
5223		for (i = disks; i--; ) {
5224			struct md_rdev *rdev;
5225			struct r5dev *dev = &sh->dev[i];
5226			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5227				/* We own a safe reference to the rdev */
5228				rdev = conf->disks[i].rdev;
5229				if (!rdev_set_badblocks(rdev, sh->sector,
5230							RAID5_STRIPE_SECTORS(conf), 0))
5231					md_error(conf->mddev, rdev);
5232				rdev_dec_pending(rdev, conf->mddev);
5233			}
5234			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5235				rdev = conf->disks[i].rdev;
5236				rdev_clear_badblocks(rdev, sh->sector,
5237						     RAID5_STRIPE_SECTORS(conf), 0);
5238				rdev_dec_pending(rdev, conf->mddev);
5239			}
5240			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5241				rdev = conf->disks[i].replacement;
5242				if (!rdev)
5243					/* rdev has been moved down */
5244					rdev = conf->disks[i].rdev;
5245				rdev_clear_badblocks(rdev, sh->sector,
5246						     RAID5_STRIPE_SECTORS(conf), 0);
5247				rdev_dec_pending(rdev, conf->mddev);
5248			}
5249		}
5250
5251	if (s.ops_request)
5252		raid_run_ops(sh, s.ops_request);
5253
5254	ops_run_io(sh, &s);
5255
5256	if (s.dec_preread_active) {
5257		/* We delay this until after ops_run_io so that if make_request
5258		 * is waiting on a flush, it won't continue until the writes
5259		 * have actually been submitted.
5260		 */
5261		atomic_dec(&conf->preread_active_stripes);
5262		if (atomic_read(&conf->preread_active_stripes) <
5263		    IO_THRESHOLD)
5264			md_wakeup_thread(conf->mddev->thread);
5265	}
5266
5267	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5268}
5269
5270static void raid5_activate_delayed(struct r5conf *conf)
5271{
5272	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5273		while (!list_empty(&conf->delayed_list)) {
5274			struct list_head *l = conf->delayed_list.next;
5275			struct stripe_head *sh;
5276			sh = list_entry(l, struct stripe_head, lru);
5277			list_del_init(l);
5278			clear_bit(STRIPE_DELAYED, &sh->state);
5279			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5280				atomic_inc(&conf->preread_active_stripes);
5281			list_add_tail(&sh->lru, &conf->hold_list);
5282			raid5_wakeup_stripe_thread(sh);
5283		}
5284	}
5285}
5286
5287static void activate_bit_delay(struct r5conf *conf,
5288	struct list_head *temp_inactive_list)
5289{
5290	/* device_lock is held */
5291	struct list_head head;
5292	list_add(&head, &conf->bitmap_list);
5293	list_del_init(&conf->bitmap_list);
5294	while (!list_empty(&head)) {
5295		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5296		int hash;
5297		list_del_init(&sh->lru);
5298		atomic_inc(&sh->count);
5299		hash = sh->hash_lock_index;
5300		__release_stripe(conf, sh, &temp_inactive_list[hash]);
5301	}
5302}
5303
5304static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5305{
5306	struct r5conf *conf = mddev->private;
5307	sector_t sector = bio->bi_iter.bi_sector;
5308	unsigned int chunk_sectors;
5309	unsigned int bio_sectors = bio_sectors(bio);
5310
5311	WARN_ON_ONCE(bio->bi_partno);
5312
5313	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5314	return  chunk_sectors >=
5315		((sector & (chunk_sectors - 1)) + bio_sectors);
5316}
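/*
 * A worked example of the boundary check above (illustrative numbers only):
 * with an effective chunk size of 128 sectors, a read starting at sector 100
 * that spans 20 sectors gives (100 & 127) + 20 == 120 <= 128, so it fits
 * entirely inside one chunk and may be serviced as a single aligned read.
 * A 40-sector read from the same offset gives 140 > 128 and straddles a
 * chunk boundary.
 */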
5317
5318/*
5319 *  add bio to the retry LIFO (in O(1) ... we are in interrupt context),
5320 *  to be sampled later by raid5d.
5321 */
5322static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5323{
5324	unsigned long flags;
5325
5326	spin_lock_irqsave(&conf->device_lock, flags);
5327
5328	bi->bi_next = conf->retry_read_aligned_list;
5329	conf->retry_read_aligned_list = bi;
5330
5331	spin_unlock_irqrestore(&conf->device_lock, flags);
5332	md_wakeup_thread(conf->mddev->thread);
5333}
5334
5335static struct bio *remove_bio_from_retry(struct r5conf *conf,
5336					 unsigned int *offset)
5337{
5338	struct bio *bi;
5339
5340	bi = conf->retry_read_aligned;
5341	if (bi) {
5342		*offset = conf->retry_read_offset;
5343		conf->retry_read_aligned = NULL;
5344		return bi;
5345	}
5346	bi = conf->retry_read_aligned_list;
5347	if (bi) {
5348		conf->retry_read_aligned_list = bi->bi_next;
5349		bi->bi_next = NULL;
5350		*offset = 0;
5351	}
5352
5353	return bi;
5354}
5355
5356/*
5357 *  The "raid5_align_endio" should check if the read succeeded and if it
5358 *  did, call bio_endio on the original bio (having bio_put the new bio
5359 *  first).
5360 *  If the read failed, it queues the original bio for a later retry by raid5d.
5361 */
5362static void raid5_align_endio(struct bio *bi)
5363{
5364	struct bio *raid_bi = bi->bi_private;
5365	struct mddev *mddev;
5366	struct r5conf *conf;
5367	struct md_rdev *rdev;
5368	blk_status_t error = bi->bi_status;
5369
5370	bio_put(bi);
5371
5372	rdev = (void*)raid_bi->bi_next;
5373	raid_bi->bi_next = NULL;
5374	mddev = rdev->mddev;
5375	conf = mddev->private;
5376
5377	rdev_dec_pending(rdev, conf->mddev);
5378
5379	if (!error) {
5380		bio_endio(raid_bi);
5381		if (atomic_dec_and_test(&conf->active_aligned_reads))
5382			wake_up(&conf->wait_for_quiescent);
5383		return;
5384	}
5385
5386	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5387
5388	add_bio_to_retry(raid_bi, conf);
5389}
5390
5391static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5392{
5393	struct r5conf *conf = mddev->private;
5394	int dd_idx;
5395	struct bio *align_bi;
5396	struct md_rdev *rdev;
5397	sector_t end_sector;
5398
5399	if (!in_chunk_boundary(mddev, raid_bio)) {
5400		pr_debug("%s: non aligned\n", __func__);
5401		return 0;
5402	}
5403	/*
5404	 * use bio_clone_fast to make a copy of the bio
5405	 */
5406	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5407	if (!align_bi)
5408		return 0;
5409	/*
5410	 *   set bi_end_io to a new function, and set bi_private to the
5411	 *     original bio.
5412	 */
5413	align_bi->bi_end_io  = raid5_align_endio;
5414	align_bi->bi_private = raid_bio;
5415	/*
5416	 *	compute position
5417	 */
5418	align_bi->bi_iter.bi_sector =
5419		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5420				     0, &dd_idx, NULL);
5421
5422	end_sector = bio_end_sector(align_bi);
5423	rcu_read_lock();
5424	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5425	if (!rdev || test_bit(Faulty, &rdev->flags) ||
5426	    rdev->recovery_offset < end_sector) {
5427		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5428		if (rdev &&
5429		    (test_bit(Faulty, &rdev->flags) ||
5430		    !(test_bit(In_sync, &rdev->flags) ||
5431		      rdev->recovery_offset >= end_sector)))
5432			rdev = NULL;
5433	}
5434
5435	if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5436		rcu_read_unlock();
5437		bio_put(align_bi);
5438		return 0;
5439	}
5440
5441	if (rdev) {
5442		sector_t first_bad;
5443		int bad_sectors;
5444
5445		atomic_inc(&rdev->nr_pending);
5446		rcu_read_unlock();
5447		raid_bio->bi_next = (void*)rdev;
5448		bio_set_dev(align_bi, rdev->bdev);
5449
5450		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5451				bio_sectors(align_bi),
5452				&first_bad, &bad_sectors)) {
5453			bio_put(align_bi);
5454			rdev_dec_pending(rdev, mddev);
5455			return 0;
5456		}
5457
5458		/* No reshape active, so we can trust rdev->data_offset */
5459		align_bi->bi_iter.bi_sector += rdev->data_offset;
5460
5461		spin_lock_irq(&conf->device_lock);
5462		wait_event_lock_irq(conf->wait_for_quiescent,
5463				    conf->quiesce == 0,
5464				    conf->device_lock);
5465		atomic_inc(&conf->active_aligned_reads);
5466		spin_unlock_irq(&conf->device_lock);
5467
5468		if (mddev->gendisk)
5469			trace_block_bio_remap(align_bi->bi_disk->queue,
5470					      align_bi, disk_devt(mddev->gendisk),
5471					      raid_bio->bi_iter.bi_sector);
5472		submit_bio_noacct(align_bi);
5473		return 1;
5474	} else {
5475		rcu_read_unlock();
5476		bio_put(align_bi);
5477		return 0;
5478	}
5479}
5480
5481static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5482{
5483	struct bio *split;
5484	sector_t sector = raid_bio->bi_iter.bi_sector;
5485	unsigned chunk_sects = mddev->chunk_sectors;
5486	unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5487
5488	if (sectors < bio_sectors(raid_bio)) {
5489		struct r5conf *conf = mddev->private;
5490		split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5491		bio_chain(split, raid_bio);
5492		submit_bio_noacct(raid_bio);
5493		raid_bio = split;
5494	}
5495
5496	if (!raid5_read_one_chunk(mddev, raid_bio))
5497		return raid_bio;
5498
5499	return NULL;
5500}
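/*
 * Sketch of the split arithmetic above (assumed values for illustration):
 * with chunk_sects == 128, a 100-sector bio starting at sector 200 has
 * 200 & 127 == 72 sectors already used in its chunk, leaving
 * 128 - 72 == 56 sectors to the boundary.  The bio is therefore split at
 * 56 sectors; the first part may be tried as an aligned chunk read, while
 * the chained remainder is resubmitted and handled separately.
 */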
5501
5502/* __get_priority_stripe - get the next stripe to process
5503 *
5504 * Full stripe writes are allowed to pass preread active stripes up until
5505 * the bypass_threshold is exceeded.  In general the bypass_count
5506 * increments when the handle_list is handled before the hold_list; however, it
5507 * will not be incremented when STRIPE_IO_STARTED is found to be set, signifying a
5508 * stripe with in-flight i/o.  The bypass_count will be reset when the
5509 * head of the hold_list has changed, i.e. the head was promoted to the
5510 * handle_list.
5511 */
5512static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5513{
5514	struct stripe_head *sh, *tmp;
5515	struct list_head *handle_list = NULL;
5516	struct r5worker_group *wg;
5517	bool second_try = !r5c_is_writeback(conf->log) &&
5518		!r5l_log_disk_error(conf);
5519	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5520		r5l_log_disk_error(conf);
5521
5522again:
5523	wg = NULL;
5524	sh = NULL;
5525	if (conf->worker_cnt_per_group == 0) {
5526		handle_list = try_loprio ? &conf->loprio_list :
5527					&conf->handle_list;
5528	} else if (group != ANY_GROUP) {
5529		handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5530				&conf->worker_groups[group].handle_list;
5531		wg = &conf->worker_groups[group];
5532	} else {
5533		int i;
5534		for (i = 0; i < conf->group_cnt; i++) {
5535			handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5536				&conf->worker_groups[i].handle_list;
5537			wg = &conf->worker_groups[i];
5538			if (!list_empty(handle_list))
5539				break;
5540		}
5541	}
5542
5543	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5544		  __func__,
5545		  list_empty(handle_list) ? "empty" : "busy",
5546		  list_empty(&conf->hold_list) ? "empty" : "busy",
5547		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
5548
5549	if (!list_empty(handle_list)) {
5550		sh = list_entry(handle_list->next, typeof(*sh), lru);
5551
5552		if (list_empty(&conf->hold_list))
5553			conf->bypass_count = 0;
5554		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5555			if (conf->hold_list.next == conf->last_hold)
5556				conf->bypass_count++;
5557			else {
5558				conf->last_hold = conf->hold_list.next;
5559				conf->bypass_count -= conf->bypass_threshold;
5560				if (conf->bypass_count < 0)
5561					conf->bypass_count = 0;
5562			}
5563		}
5564	} else if (!list_empty(&conf->hold_list) &&
5565		   ((conf->bypass_threshold &&
5566		     conf->bypass_count > conf->bypass_threshold) ||
5567		    atomic_read(&conf->pending_full_writes) == 0)) {
5568
5569		list_for_each_entry(tmp, &conf->hold_list,  lru) {
5570			if (conf->worker_cnt_per_group == 0 ||
5571			    group == ANY_GROUP ||
5572			    !cpu_online(tmp->cpu) ||
5573			    cpu_to_group(tmp->cpu) == group) {
5574				sh = tmp;
5575				break;
5576			}
5577		}
5578
5579		if (sh) {
5580			conf->bypass_count -= conf->bypass_threshold;
5581			if (conf->bypass_count < 0)
5582				conf->bypass_count = 0;
5583		}
5584		wg = NULL;
5585	}
5586
5587	if (!sh) {
5588		if (second_try)
5589			return NULL;
5590		second_try = true;
5591		try_loprio = !try_loprio;
5592		goto again;
5593	}
5594
5595	if (wg) {
5596		wg->stripes_cnt--;
5597		sh->group = NULL;
5598	}
5599	list_del_init(&sh->lru);
5600	BUG_ON(atomic_inc_return(&sh->count) != 1);
5601	return sh;
5602}
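/*
 * Rough illustration of the bypass accounting implemented above (assumed
 * numbers): with preread_bypass_threshold == 2, each time a stripe without
 * STRIPE_IO_STARTED is taken from handle_list while the same stripe keeps
 * waiting at the head of hold_list, bypass_count is incremented.  Once
 * handle_list drains and bypass_count has exceeded 2 (or no full-stripe
 * writes are pending), a held stripe is promoted and bypass_count is
 * reduced by the threshold, so preread stripes cannot be starved
 * indefinitely by a stream of full-stripe writes.
 */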
5603
5604struct raid5_plug_cb {
5605	struct blk_plug_cb	cb;
5606	struct list_head	list;
5607	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5608};
5609
5610static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5611{
5612	struct raid5_plug_cb *cb = container_of(
5613		blk_cb, struct raid5_plug_cb, cb);
5614	struct stripe_head *sh;
5615	struct mddev *mddev = cb->cb.data;
5616	struct r5conf *conf = mddev->private;
5617	int cnt = 0;
5618	int hash;
5619
5620	if (cb->list.next && !list_empty(&cb->list)) {
5621		spin_lock_irq(&conf->device_lock);
5622		while (!list_empty(&cb->list)) {
5623			sh = list_first_entry(&cb->list, struct stripe_head, lru);
5624			list_del_init(&sh->lru);
5625			/*
5626			 * avoid the race where release_stripe_plug() sees
5627			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5628			 * is still on our list
5629			 */
5630			smp_mb__before_atomic();
5631			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5632			/*
5633			 * STRIPE_ON_RELEASE_LIST could be set here. In that
5634			 * case, the count is always > 1 here
5635			 */
5636			hash = sh->hash_lock_index;
5637			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5638			cnt++;
5639		}
5640		spin_unlock_irq(&conf->device_lock);
5641	}
5642	release_inactive_stripe_list(conf, cb->temp_inactive_list,
5643				     NR_STRIPE_HASH_LOCKS);
5644	if (mddev->queue)
5645		trace_block_unplug(mddev->queue, cnt, !from_schedule);
5646	kfree(cb);
5647}
5648
5649static void release_stripe_plug(struct mddev *mddev,
5650				struct stripe_head *sh)
5651{
5652	struct blk_plug_cb *blk_cb = blk_check_plugged(
5653		raid5_unplug, mddev,
5654		sizeof(struct raid5_plug_cb));
5655	struct raid5_plug_cb *cb;
5656
5657	if (!blk_cb) {
5658		raid5_release_stripe(sh);
5659		return;
5660	}
5661
5662	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5663
5664	if (cb->list.next == NULL) {
5665		int i;
5666		INIT_LIST_HEAD(&cb->list);
5667		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5668			INIT_LIST_HEAD(cb->temp_inactive_list + i);
5669	}
5670
5671	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5672		list_add_tail(&sh->lru, &cb->list);
5673	else
5674		raid5_release_stripe(sh);
5675}
5676
5677static void make_discard_request(struct mddev *mddev, struct bio *bi)
5678{
5679	struct r5conf *conf = mddev->private;
5680	sector_t logical_sector, last_sector;
5681	struct stripe_head *sh;
5682	int stripe_sectors;
5683
5684	if (mddev->reshape_position != MaxSector)
5685		/* Skip discard while reshape is happening */
5686		return;
5687
5688	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5689	last_sector = bio_end_sector(bi);
5690
5691	bi->bi_next = NULL;
5692
5693	stripe_sectors = conf->chunk_sectors *
5694		(conf->raid_disks - conf->max_degraded);
5695	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5696					       stripe_sectors);
5697	sector_div(last_sector, stripe_sectors);
5698
5699	logical_sector *= conf->chunk_sectors;
5700	last_sector *= conf->chunk_sectors;
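	/*
	 * Worked example of the rounding above (assumed geometry): with
	 * chunk_sectors == 1024 on a 5-drive RAID5 (4 data disks),
	 * stripe_sectors == 4096.  A discard of array sectors [1000, 10000)
	 * rounds the start up to full-stripe 1 and the end down to
	 * full-stripe 2, so only array sectors 4096..8191 (the one fully
	 * covered stripe) are discarded; the loop below walks that stripe
	 * in per-device addresses 1024..2047.
	 */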
5701
5702	for (; logical_sector < last_sector;
5703	     logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5704		DEFINE_WAIT(w);
5705		int d;
5706	again:
5707		sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5708		prepare_to_wait(&conf->wait_for_overlap, &w,
5709				TASK_UNINTERRUPTIBLE);
5710		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5711		if (test_bit(STRIPE_SYNCING, &sh->state)) {
5712			raid5_release_stripe(sh);
5713			schedule();
5714			goto again;
5715		}
5716		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5717		spin_lock_irq(&sh->stripe_lock);
5718		for (d = 0; d < conf->raid_disks; d++) {
5719			if (d == sh->pd_idx || d == sh->qd_idx)
5720				continue;
5721			if (sh->dev[d].towrite || sh->dev[d].toread) {
5722				set_bit(R5_Overlap, &sh->dev[d].flags);
5723				spin_unlock_irq(&sh->stripe_lock);
5724				raid5_release_stripe(sh);
5725				schedule();
5726				goto again;
5727			}
5728		}
5729		set_bit(STRIPE_DISCARD, &sh->state);
5730		finish_wait(&conf->wait_for_overlap, &w);
5731		sh->overwrite_disks = 0;
5732		for (d = 0; d < conf->raid_disks; d++) {
5733			if (d == sh->pd_idx || d == sh->qd_idx)
5734				continue;
5735			sh->dev[d].towrite = bi;
5736			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5737			bio_inc_remaining(bi);
5738			md_write_inc(mddev, bi);
5739			sh->overwrite_disks++;
5740		}
5741		spin_unlock_irq(&sh->stripe_lock);
5742		if (conf->mddev->bitmap) {
5743			for (d = 0;
5744			     d < conf->raid_disks - conf->max_degraded;
5745			     d++)
5746				md_bitmap_startwrite(mddev->bitmap,
5747						     sh->sector,
5748						     RAID5_STRIPE_SECTORS(conf),
5749						     0);
5750			sh->bm_seq = conf->seq_flush + 1;
5751			set_bit(STRIPE_BIT_DELAY, &sh->state);
5752		}
5753
5754		set_bit(STRIPE_HANDLE, &sh->state);
5755		clear_bit(STRIPE_DELAYED, &sh->state);
5756		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5757			atomic_inc(&conf->preread_active_stripes);
5758		release_stripe_plug(mddev, sh);
5759	}
5760
5761	bio_endio(bi);
5762}
5763
5764static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5765{
5766	struct r5conf *conf = mddev->private;
5767	int dd_idx;
5768	sector_t new_sector;
5769	sector_t logical_sector, last_sector;
5770	struct stripe_head *sh;
5771	const int rw = bio_data_dir(bi);
5772	DEFINE_WAIT(w);
5773	bool do_prepare;
5774	bool do_flush = false;
5775
5776	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5777		int ret = log_handle_flush_request(conf, bi);
5778
5779		if (ret == 0)
5780			return true;
5781		if (ret == -ENODEV) {
5782			if (md_flush_request(mddev, bi))
5783				return true;
5784		}
5785		/* ret == -EAGAIN, fallback */
5786		/*
5787		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5788		 * we need to flush journal device
5789		 */
5790		do_flush = bi->bi_opf & REQ_PREFLUSH;
5791	}
5792
5793	if (!md_write_start(mddev, bi))
5794		return false;
5795	/*
5796	 * If array is degraded, better not do chunk aligned read because
5797	 * later we might have to read it again in order to reconstruct
5798	 * data on failed drives.
5799	 */
5800	if (rw == READ && mddev->degraded == 0 &&
5801	    mddev->reshape_position == MaxSector) {
5802		bi = chunk_aligned_read(mddev, bi);
5803		if (!bi)
5804			return true;
5805	}
5806
5807	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5808		make_discard_request(mddev, bi);
5809		md_write_end(mddev);
5810		return true;
5811	}
5812
5813	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5814	last_sector = bio_end_sector(bi);
5815	bi->bi_next = NULL;
5816
5817	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5818	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5819		int previous;
5820		int seq;
5821
5822		do_prepare = false;
5823	retry:
5824		seq = read_seqcount_begin(&conf->gen_lock);
5825		previous = 0;
5826		if (do_prepare)
5827			prepare_to_wait(&conf->wait_for_overlap, &w,
5828				TASK_UNINTERRUPTIBLE);
5829		if (unlikely(conf->reshape_progress != MaxSector)) {
5830			/* spinlock is needed as reshape_progress may be
5831			 * 64bit on a 32bit platform, and so it might be
5832			 * possible to see a half-updated value
5833			 * Of course reshape_progress could change after
5834			 * the lock is dropped, so once we get a reference
5835			 * to the stripe that we think it is, we will have
5836			 * to check again.
5837			 */
5838			spin_lock_irq(&conf->device_lock);
5839			if (mddev->reshape_backwards
5840			    ? logical_sector < conf->reshape_progress
5841			    : logical_sector >= conf->reshape_progress) {
5842				previous = 1;
5843			} else {
5844				if (mddev->reshape_backwards
5845				    ? logical_sector < conf->reshape_safe
5846				    : logical_sector >= conf->reshape_safe) {
5847					spin_unlock_irq(&conf->device_lock);
5848					schedule();
5849					do_prepare = true;
5850					goto retry;
5851				}
5852			}
5853			spin_unlock_irq(&conf->device_lock);
5854		}
5855
5856		new_sector = raid5_compute_sector(conf, logical_sector,
5857						  previous,
5858						  &dd_idx, NULL);
5859		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5860			(unsigned long long)new_sector,
5861			(unsigned long long)logical_sector);
5862
5863		sh = raid5_get_active_stripe(conf, new_sector, previous,
5864				       (bi->bi_opf & REQ_RAHEAD), 0);
5865		if (sh) {
5866			if (unlikely(previous)) {
5867				/* expansion might have moved on while waiting for a
5868				 * stripe, so we must do the range check again.
5869				 * Expansion could still move past after this
5870				 * test, but as we are holding a reference to
5871				 * 'sh', we know that if that happens,
5872				 *  STRIPE_EXPANDING will get set and the expansion
5873				 * won't proceed until we finish with the stripe.
5874				 */
5875				int must_retry = 0;
5876				spin_lock_irq(&conf->device_lock);
5877				if (mddev->reshape_backwards
5878				    ? logical_sector >= conf->reshape_progress
5879				    : logical_sector < conf->reshape_progress)
5880					/* mismatch, need to try again */
5881					must_retry = 1;
5882				spin_unlock_irq(&conf->device_lock);
5883				if (must_retry) {
5884					raid5_release_stripe(sh);
5885					schedule();
5886					do_prepare = true;
5887					goto retry;
5888				}
5889			}
5890			if (read_seqcount_retry(&conf->gen_lock, seq)) {
5891				/* Might have got the wrong stripe_head
5892				 * by accident
5893				 */
5894				raid5_release_stripe(sh);
5895				goto retry;
5896			}
5897
5898			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5899			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5900				/* Stripe is busy expanding or
5901				 * add failed due to overlap.  Flush everything
5902				 * and wait a while
5903				 */
5904				md_wakeup_thread(mddev->thread);
5905				raid5_release_stripe(sh);
5906				schedule();
5907				do_prepare = true;
5908				goto retry;
5909			}
5910			if (do_flush) {
5911				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5912				/* we only need flush for one stripe */
5913				do_flush = false;
5914			}
5915
5916			set_bit(STRIPE_HANDLE, &sh->state);
5917			clear_bit(STRIPE_DELAYED, &sh->state);
5918			if ((!sh->batch_head || sh == sh->batch_head) &&
5919			    (bi->bi_opf & REQ_SYNC) &&
5920			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5921				atomic_inc(&conf->preread_active_stripes);
5922			release_stripe_plug(mddev, sh);
5923		} else {
5924			/* cannot get stripe for read-ahead, just give-up */
5925			bi->bi_status = BLK_STS_IOERR;
5926			break;
5927		}
5928	}
5929	finish_wait(&conf->wait_for_overlap, &w);
5930
5931	if (rw == WRITE)
5932		md_write_end(mddev);
5933	bio_endio(bi);
5934	return true;
5935}
5936
5937static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5938
5939static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5940{
5941	/* reshaping is quite different to recovery/resync so it is
5942	 * handled quite separately ... here.
5943	 *
5944	 * On each call to sync_request, we gather one chunk worth of
5945	 * destination stripes and flag them as expanding.
5946	 * Then we find all the source stripes and request reads.
5947	 * As the reads complete, handle_stripe will copy the data
5948	 * into the destination stripe and release that stripe.
5949	 */
5950	struct r5conf *conf = mddev->private;
5951	struct stripe_head *sh;
5952	struct md_rdev *rdev;
5953	sector_t first_sector, last_sector;
5954	int raid_disks = conf->previous_raid_disks;
5955	int data_disks = raid_disks - conf->max_degraded;
5956	int new_data_disks = conf->raid_disks - conf->max_degraded;
5957	int i;
5958	int dd_idx;
5959	sector_t writepos, readpos, safepos;
5960	sector_t stripe_addr;
5961	int reshape_sectors;
5962	struct list_head stripes;
5963	sector_t retn;
5964
5965	if (sector_nr == 0) {
5966		/* If restarting in the middle, skip the initial sectors */
5967		if (mddev->reshape_backwards &&
5968		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5969			sector_nr = raid5_size(mddev, 0, 0)
5970				- conf->reshape_progress;
5971		} else if (mddev->reshape_backwards &&
5972			   conf->reshape_progress == MaxSector) {
5973			/* shouldn't happen, but just in case, finish up.*/
5974			sector_nr = MaxSector;
5975		} else if (!mddev->reshape_backwards &&
5976			   conf->reshape_progress > 0)
5977			sector_nr = conf->reshape_progress;
5978		sector_div(sector_nr, new_data_disks);
5979		if (sector_nr) {
5980			mddev->curr_resync_completed = sector_nr;
5981			sysfs_notify_dirent_safe(mddev->sysfs_completed);
5982			*skipped = 1;
5983			retn = sector_nr;
5984			goto finish;
5985		}
5986	}
5987
5988	/* We need to process a full chunk at a time.
5989	 * If old and new chunk sizes differ, we need to process the
5990	 * largest of these
5991	 */
5992
5993	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5994
5995	/* We update the metadata at least every 10 seconds, or when
5996	 * the data about to be copied would over-write the source of
5997	 * the data at the front of the range.  i.e. when the position one
5998	 * new_stripe along from reshape_progress maps (in the new layout) to
5999	 * after where reshape_safe maps in the old layout.
6000	 */
6001	writepos = conf->reshape_progress;
6002	sector_div(writepos, new_data_disks);
6003	readpos = conf->reshape_progress;
6004	sector_div(readpos, data_disks);
6005	safepos = conf->reshape_safe;
6006	sector_div(safepos, data_disks);
6007	if (mddev->reshape_backwards) {
6008		BUG_ON(writepos < reshape_sectors);
6009		writepos -= reshape_sectors;
6010		readpos += reshape_sectors;
6011		safepos += reshape_sectors;
6012	} else {
6013		writepos += reshape_sectors;
6014		/* readpos and safepos are worst-case calculations.
6015		 * A negative number is overly pessimistic, and causes
6016		 * obvious problems for unsigned storage.  So clip to 0.
6017		 */
6018		readpos -= min_t(sector_t, reshape_sectors, readpos);
6019		safepos -= min_t(sector_t, reshape_sectors, safepos);
6020	}
6021
6022	/* Having calculated the 'writepos' possibly use it
6023	 * to set 'stripe_addr' which is where we will write to.
6024	 */
6025	if (mddev->reshape_backwards) {
6026		BUG_ON(conf->reshape_progress == 0);
6027		stripe_addr = writepos;
6028		BUG_ON((mddev->dev_sectors &
6029			~((sector_t)reshape_sectors - 1))
6030		       - reshape_sectors - stripe_addr
6031		       != sector_nr);
6032	} else {
6033		BUG_ON(writepos != sector_nr + reshape_sectors);
6034		stripe_addr = sector_nr;
6035	}
6036
6037	/* 'writepos' is the most advanced device address we might write.
6038	 * 'readpos' is the least advanced device address we might read.
6039	 * 'safepos' is the least address recorded in the metadata as having
6040	 *     been reshaped.
6041	 * If there is a min_offset_diff, these are adjusted either by
6042	 * increasing the safepos/readpos if diff is negative, or
6043	 * increasing writepos if diff is positive.
6044	 * If 'readpos' is then behind 'writepos', there is no way that we can
6045	 * ensure safety in the face of a crash - that must be done by userspace
6046	 * making a backup of the data.  So in that case there is no particular
6047	 * rush to update metadata.
6048	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
6049	 * update the metadata to advance 'safepos' to match 'readpos' so that
6050	 * we can be safe in the event of a crash.
6051	 * So we insist on updating metadata if safepos is behind writepos and
6052	 * readpos is beyond writepos.
6053	 * In any case, update the metadata every 10 seconds.
6054	 * Maybe that number should be configurable, but I'm not sure it is
6055	 * worth it.... maybe it could be a multiple of safemode_delay???
6056	 */
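	/*
	 * For instance (illustrative values): on a forward reshape with
	 * writepos == 1000, readpos == 1100 and safepos == 900, safepos is
	 * behind writepos while readpos is ahead of it, so the superblock
	 * must be updated before any of this chunk is written; the same
	 * branch is also taken unconditionally once 10 seconds have passed
	 * since the last checkpoint.
	 */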
6057	if (conf->min_offset_diff < 0) {
6058		safepos += -conf->min_offset_diff;
6059		readpos += -conf->min_offset_diff;
6060	} else
6061		writepos += conf->min_offset_diff;
6062
6063	if ((mddev->reshape_backwards
6064	     ? (safepos > writepos && readpos < writepos)
6065	     : (safepos < writepos && readpos > writepos)) ||
6066	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6067		/* Cannot proceed until we've updated the superblock... */
6068		wait_event(conf->wait_for_overlap,
6069			   atomic_read(&conf->reshape_stripes)==0
6070			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6071		if (atomic_read(&conf->reshape_stripes) != 0)
6072			return 0;
6073		mddev->reshape_position = conf->reshape_progress;
6074		mddev->curr_resync_completed = sector_nr;
6075		if (!mddev->reshape_backwards)
6076			/* Can update recovery_offset */
6077			rdev_for_each(rdev, mddev)
6078				if (rdev->raid_disk >= 0 &&
6079				    !test_bit(Journal, &rdev->flags) &&
6080				    !test_bit(In_sync, &rdev->flags) &&
6081				    rdev->recovery_offset < sector_nr)
6082					rdev->recovery_offset = sector_nr;
6083
6084		conf->reshape_checkpoint = jiffies;
6085		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6086		md_wakeup_thread(mddev->thread);
6087		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6088			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6089		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6090			return 0;
6091		spin_lock_irq(&conf->device_lock);
6092		conf->reshape_safe = mddev->reshape_position;
6093		spin_unlock_irq(&conf->device_lock);
6094		wake_up(&conf->wait_for_overlap);
6095		sysfs_notify_dirent_safe(mddev->sysfs_completed);
6096	}
6097
6098	INIT_LIST_HEAD(&stripes);
6099	for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6100		int j;
6101		int skipped_disk = 0;
6102		sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
6103		set_bit(STRIPE_EXPANDING, &sh->state);
6104		atomic_inc(&conf->reshape_stripes);
6105		/* If any of this stripe is beyond the end of the old
6106		 * array, then we need to zero those blocks
6107		 */
6108		for (j=sh->disks; j--;) {
6109			sector_t s;
6110			if (j == sh->pd_idx)
6111				continue;
6112			if (conf->level == 6 &&
6113			    j == sh->qd_idx)
6114				continue;
6115			s = raid5_compute_blocknr(sh, j, 0);
6116			if (s < raid5_size(mddev, 0, 0)) {
6117				skipped_disk = 1;
6118				continue;
6119			}
6120			memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6121			set_bit(R5_Expanded, &sh->dev[j].flags);
6122			set_bit(R5_UPTODATE, &sh->dev[j].flags);
6123		}
6124		if (!skipped_disk) {
6125			set_bit(STRIPE_EXPAND_READY, &sh->state);
6126			set_bit(STRIPE_HANDLE, &sh->state);
6127		}
6128		list_add(&sh->lru, &stripes);
6129	}
6130	spin_lock_irq(&conf->device_lock);
6131	if (mddev->reshape_backwards)
6132		conf->reshape_progress -= reshape_sectors * new_data_disks;
6133	else
6134		conf->reshape_progress += reshape_sectors * new_data_disks;
6135	spin_unlock_irq(&conf->device_lock);
6136	/* OK, those stripes are ready. We can start scheduling
6137	 * reads on the source stripes.
6138	 * The source stripes are determined by mapping the first and last
6139	 * block on the destination stripes.
6140	 */
6141	first_sector =
6142		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6143				     1, &dd_idx, NULL);
6144	last_sector =
6145		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6146					    * new_data_disks - 1),
6147				     1, &dd_idx, NULL);
6148	if (last_sector >= mddev->dev_sectors)
6149		last_sector = mddev->dev_sectors - 1;
6150	while (first_sector <= last_sector) {
6151		sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
6152		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6153		set_bit(STRIPE_HANDLE, &sh->state);
6154		raid5_release_stripe(sh);
6155		first_sector += RAID5_STRIPE_SECTORS(conf);
6156	}
6157	/* Now that the sources are clearly marked, we can release
6158	 * the destination stripes
6159	 */
6160	while (!list_empty(&stripes)) {
6161		sh = list_entry(stripes.next, struct stripe_head, lru);
6162		list_del_init(&sh->lru);
6163		raid5_release_stripe(sh);
6164	}
6165	/* If this takes us to the resync_max point where we have to pause,
6166	 * then we need to write out the superblock.
6167	 */
6168	sector_nr += reshape_sectors;
6169	retn = reshape_sectors;
6170finish:
6171	if (mddev->curr_resync_completed > mddev->resync_max ||
6172	    (sector_nr - mddev->curr_resync_completed) * 2
6173	    >= mddev->resync_max - mddev->curr_resync_completed) {
6174		/* Cannot proceed until we've updated the superblock... */
6175		wait_event(conf->wait_for_overlap,
6176			   atomic_read(&conf->reshape_stripes) == 0
6177			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6178		if (atomic_read(&conf->reshape_stripes) != 0)
6179			goto ret;
6180		mddev->reshape_position = conf->reshape_progress;
6181		mddev->curr_resync_completed = sector_nr;
6182		if (!mddev->reshape_backwards)
6183			/* Can update recovery_offset */
6184			rdev_for_each(rdev, mddev)
6185				if (rdev->raid_disk >= 0 &&
6186				    !test_bit(Journal, &rdev->flags) &&
6187				    !test_bit(In_sync, &rdev->flags) &&
6188				    rdev->recovery_offset < sector_nr)
6189					rdev->recovery_offset = sector_nr;
6190		conf->reshape_checkpoint = jiffies;
6191		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6192		md_wakeup_thread(mddev->thread);
6193		wait_event(mddev->sb_wait,
6194			   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6195			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6196		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6197			goto ret;
6198		spin_lock_irq(&conf->device_lock);
6199		conf->reshape_safe = mddev->reshape_position;
6200		spin_unlock_irq(&conf->device_lock);
6201		wake_up(&conf->wait_for_overlap);
6202		sysfs_notify_dirent_safe(mddev->sysfs_completed);
6203	}
6204ret:
6205	return retn;
6206}
6207
6208static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6209					  int *skipped)
6210{
6211	struct r5conf *conf = mddev->private;
6212	struct stripe_head *sh;
6213	sector_t max_sector = mddev->dev_sectors;
6214	sector_t sync_blocks;
6215	int still_degraded = 0;
6216	int i;
6217
6218	if (sector_nr >= max_sector) {
6219		/* just being told to finish up .. nothing much to do */
6220
6221		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6222			end_reshape(conf);
6223			return 0;
6224		}
6225
6226		if (mddev->curr_resync < max_sector) /* aborted */
6227			md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6228					   &sync_blocks, 1);
6229		else /* completed sync */
6230			conf->fullsync = 0;
6231		md_bitmap_close_sync(mddev->bitmap);
6232
6233		return 0;
6234	}
6235
6236	/* Allow raid5_quiesce to complete */
6237	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6238
6239	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6240		return reshape_request(mddev, sector_nr, skipped);
6241
6242	/* No need to check resync_max as we never do more than one
6243	 * stripe, and as resync_max will always be on a chunk boundary,
6244	 * if the check in md_do_sync didn't fire, there is no chance
6245	 * of overstepping resync_max here
6246	 */
6247
6248	/* if there are too many failed drives and we are trying
6249	 * to resync, then assert that we are finished, because there is
6250	 * nothing we can do.
6251	 */
6252	if (mddev->degraded >= conf->max_degraded &&
6253	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6254		sector_t rv = mddev->dev_sectors - sector_nr;
6255		*skipped = 1;
6256		return rv;
6257	}
6258	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6259	    !conf->fullsync &&
6260	    !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6261	    sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6262		/* we can skip this block, and probably more */
6263		do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6264		*skipped = 1;
6265		/* keep things rounded to whole stripes */
6266		return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6267	}
6268
6269	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6270
6271	sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6272	if (sh == NULL) {
6273		sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6274		/* make sure we don't swamp the stripe cache if someone else
6275		 * is trying to get access
6276		 */
6277		schedule_timeout_uninterruptible(1);
6278	}
6279	/* Need to check if the array will still be degraded after recovery/resync.
6280	 * Note that in case of > 1 drive failures it's possible we're rebuilding
6281	 * one drive while leaving another faulty drive in the array.
6282	 */
6283	rcu_read_lock();
6284	for (i = 0; i < conf->raid_disks; i++) {
6285		struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6286
6287		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6288			still_degraded = 1;
6289	}
6290	rcu_read_unlock();
6291
6292	md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6293
6294	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6295	set_bit(STRIPE_HANDLE, &sh->state);
6296
6297	raid5_release_stripe(sh);
6298
6299	return RAID5_STRIPE_SECTORS(conf);
6300}
6301
6302static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6303			       unsigned int offset)
6304{
6305	/* We may not be able to submit a whole bio at once as there
6306	 * may not be enough stripe_heads available.
6307	 * We cannot pre-allocate enough stripe_heads as we may need
6308	 * more than exist in the cache (if we allow ever larger chunks).
6309	 * So we do one stripe head at a time and record in
6310	 * conf->retry_read_offset how many have been done.
6311	 *
6312	 * We *know* that this entire raid_bio is in one chunk, so
6313	 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
6314	 */
6315	struct stripe_head *sh;
6316	int dd_idx;
6317	sector_t sector, logical_sector, last_sector;
6318	int scnt = 0;
6319	int handled = 0;
6320
6321	logical_sector = raid_bio->bi_iter.bi_sector &
6322		~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6323	sector = raid5_compute_sector(conf, logical_sector,
6324				      0, &dd_idx, NULL);
6325	last_sector = bio_end_sector(raid_bio);
6326
6327	for (; logical_sector < last_sector;
6328	     logical_sector += RAID5_STRIPE_SECTORS(conf),
6329		     sector += RAID5_STRIPE_SECTORS(conf),
6330		     scnt++) {
6331
6332		if (scnt < offset)
6333			/* already done this stripe */
6334			continue;
6335
6336		sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6337
6338		if (!sh) {
6339			/* failed to get a stripe - must wait */
6340			conf->retry_read_aligned = raid_bio;
6341			conf->retry_read_offset = scnt;
6342			return handled;
6343		}
6344
6345		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6346			raid5_release_stripe(sh);
6347			conf->retry_read_aligned = raid_bio;
6348			conf->retry_read_offset = scnt;
6349			return handled;
6350		}
6351
6352		set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6353		handle_stripe(sh);
6354		raid5_release_stripe(sh);
6355		handled++;
6356	}
6357
6358	bio_endio(raid_bio);
6359
6360	if (atomic_dec_and_test(&conf->active_aligned_reads))
6361		wake_up(&conf->wait_for_quiescent);
6362	return handled;
6363}
6364
6365static int handle_active_stripes(struct r5conf *conf, int group,
6366				 struct r5worker *worker,
6367				 struct list_head *temp_inactive_list)
6368		__releases(&conf->device_lock)
6369		__acquires(&conf->device_lock)
6370{
6371	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6372	int i, batch_size = 0, hash;
6373	bool release_inactive = false;
6374
6375	while (batch_size < MAX_STRIPE_BATCH &&
6376			(sh = __get_priority_stripe(conf, group)) != NULL)
6377		batch[batch_size++] = sh;
6378
6379	if (batch_size == 0) {
6380		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6381			if (!list_empty(temp_inactive_list + i))
6382				break;
6383		if (i == NR_STRIPE_HASH_LOCKS) {
6384			spin_unlock_irq(&conf->device_lock);
6385			log_flush_stripe_to_raid(conf);
6386			spin_lock_irq(&conf->device_lock);
6387			return batch_size;
6388		}
6389		release_inactive = true;
6390	}
6391	spin_unlock_irq(&conf->device_lock);
6392
6393	release_inactive_stripe_list(conf, temp_inactive_list,
6394				     NR_STRIPE_HASH_LOCKS);
6395
6396	r5l_flush_stripe_to_raid(conf->log);
6397	if (release_inactive) {
6398		spin_lock_irq(&conf->device_lock);
6399		return 0;
6400	}
6401
6402	for (i = 0; i < batch_size; i++)
6403		handle_stripe(batch[i]);
6404	log_write_stripe_run(conf);
6405
6406	cond_resched();
6407
6408	spin_lock_irq(&conf->device_lock);
6409	for (i = 0; i < batch_size; i++) {
6410		hash = batch[i]->hash_lock_index;
6411		__release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6412	}
6413	return batch_size;
6414}
6415
6416static void raid5_do_work(struct work_struct *work)
6417{
6418	struct r5worker *worker = container_of(work, struct r5worker, work);
6419	struct r5worker_group *group = worker->group;
6420	struct r5conf *conf = group->conf;
6421	struct mddev *mddev = conf->mddev;
6422	int group_id = group - conf->worker_groups;
6423	int handled;
6424	struct blk_plug plug;
6425
6426	pr_debug("+++ raid5worker active\n");
6427
6428	blk_start_plug(&plug);
6429	handled = 0;
6430	spin_lock_irq(&conf->device_lock);
6431	while (1) {
6432		int batch_size, released;
6433
6434		released = release_stripe_list(conf, worker->temp_inactive_list);
6435
6436		batch_size = handle_active_stripes(conf, group_id, worker,
6437						   worker->temp_inactive_list);
6438		worker->working = false;
6439		if (!batch_size && !released)
6440			break;
6441		handled += batch_size;
6442		wait_event_lock_irq(mddev->sb_wait,
6443			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6444			conf->device_lock);
6445	}
6446	pr_debug("%d stripes handled\n", handled);
6447
6448	spin_unlock_irq(&conf->device_lock);
6449
6450	flush_deferred_bios(conf);
6451
6452	r5l_flush_stripe_to_raid(conf->log);
6453
6454	async_tx_issue_pending_all();
6455	blk_finish_plug(&plug);
6456
6457	pr_debug("--- raid5worker inactive\n");
6458}
6459
6460/*
6461 * This is our raid5 kernel thread.
6462 *
6463 * We scan the hash table for stripes which can be handled now.
6464 * During the scan, completed stripes are saved for us by the interrupt
6465 * handler, so that they will not have to wait for our next wakeup.
6466 */
6467static void raid5d(struct md_thread *thread)
6468{
6469	struct mddev *mddev = thread->mddev;
6470	struct r5conf *conf = mddev->private;
6471	int handled;
6472	struct blk_plug plug;
6473
6474	pr_debug("+++ raid5d active\n");
6475
6476	md_check_recovery(mddev);
6477
6478	blk_start_plug(&plug);
6479	handled = 0;
6480	spin_lock_irq(&conf->device_lock);
6481	while (1) {
6482		struct bio *bio;
6483		int batch_size, released;
6484		unsigned int offset;
6485
6486		released = release_stripe_list(conf, conf->temp_inactive_list);
6487		if (released)
6488			clear_bit(R5_DID_ALLOC, &conf->cache_state);
6489
6490		if (
6491		    !list_empty(&conf->bitmap_list)) {
6492			/* Now is a good time to flush some bitmap updates */
6493			conf->seq_flush++;
6494			spin_unlock_irq(&conf->device_lock);
6495			md_bitmap_unplug(mddev->bitmap);
6496			spin_lock_irq(&conf->device_lock);
6497			conf->seq_write = conf->seq_flush;
6498			activate_bit_delay(conf, conf->temp_inactive_list);
6499		}
6500		raid5_activate_delayed(conf);
6501
6502		while ((bio = remove_bio_from_retry(conf, &offset))) {
6503			int ok;
6504			spin_unlock_irq(&conf->device_lock);
6505			ok = retry_aligned_read(conf, bio, offset);
6506			spin_lock_irq(&conf->device_lock);
6507			if (!ok)
6508				break;
6509			handled++;
6510		}
6511
6512		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6513						   conf->temp_inactive_list);
6514		if (!batch_size && !released)
6515			break;
6516		handled += batch_size;
6517
6518		if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6519			spin_unlock_irq(&conf->device_lock);
6520			md_check_recovery(mddev);
6521			spin_lock_irq(&conf->device_lock);
6522		}
6523	}
6524	pr_debug("%d stripes handled\n", handled);
6525
6526	spin_unlock_irq(&conf->device_lock);
6527	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6528	    mutex_trylock(&conf->cache_size_mutex)) {
6529		grow_one_stripe(conf, __GFP_NOWARN);
6530		/* Set flag even if allocation failed.  This helps
6531		 * slow down allocation requests when mem is short
6532		 */
6533		set_bit(R5_DID_ALLOC, &conf->cache_state);
6534		mutex_unlock(&conf->cache_size_mutex);
6535	}
6536
6537	flush_deferred_bios(conf);
6538
6539	r5l_flush_stripe_to_raid(conf->log);
6540
6541	async_tx_issue_pending_all();
6542	blk_finish_plug(&plug);
6543
6544	pr_debug("--- raid5d inactive\n");
6545}
6546
6547static ssize_t
6548raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6549{
6550	struct r5conf *conf;
6551	int ret = 0;
6552	spin_lock(&mddev->lock);
6553	conf = mddev->private;
6554	if (conf)
6555		ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6556	spin_unlock(&mddev->lock);
6557	return ret;
6558}
6559
6560int
6561raid5_set_cache_size(struct mddev *mddev, int size)
6562{
6563	int result = 0;
6564	struct r5conf *conf = mddev->private;
6565
6566	if (size <= 16 || size > 32768)
6567		return -EINVAL;
6568
6569	conf->min_nr_stripes = size;
6570	mutex_lock(&conf->cache_size_mutex);
6571	while (size < conf->max_nr_stripes &&
6572	       drop_one_stripe(conf))
6573		;
6574	mutex_unlock(&conf->cache_size_mutex);
6575
6576	md_allow_write(mddev);
6577
6578	mutex_lock(&conf->cache_size_mutex);
6579	while (size > conf->max_nr_stripes)
6580		if (!grow_one_stripe(conf, GFP_KERNEL)) {
6581			conf->min_nr_stripes = conf->max_nr_stripes;
6582			result = -ENOMEM;
6583			break;
6584		}
6585	mutex_unlock(&conf->cache_size_mutex);
6586
6587	return result;
6588}
6589EXPORT_SYMBOL(raid5_set_cache_size);
6590
6591static ssize_t
6592raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6593{
6594	struct r5conf *conf;
6595	unsigned long new;
6596	int err;
6597
6598	if (len >= PAGE_SIZE)
6599		return -EINVAL;
6600	if (kstrtoul(page, 10, &new))
6601		return -EINVAL;
6602	err = mddev_lock(mddev);
6603	if (err)
6604		return err;
6605	conf = mddev->private;
6606	if (!conf)
6607		err = -ENODEV;
6608	else
6609		err = raid5_set_cache_size(mddev, new);
6610	mddev_unlock(mddev);
6611
6612	return err ?: len;
6613}
6614
6615static struct md_sysfs_entry
6616raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6617				raid5_show_stripe_cache_size,
6618				raid5_store_stripe_cache_size);
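/*
 * The attribute above backs the usual tuning knob, e.g.
 * (hypothetical device name):
 *
 *   echo 4096 > /sys/block/md0/md/stripe_cache_size
 *
 * which ends up in raid5_set_cache_size(); values outside the
 * 17..32768 range accepted there are rejected with -EINVAL.
 */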
6619
6620static ssize_t
6621raid5_show_rmw_level(struct mddev  *mddev, char *page)
6622{
6623	struct r5conf *conf = mddev->private;
6624	if (conf)
6625		return sprintf(page, "%d\n", conf->rmw_level);
6626	else
6627		return 0;
6628}
6629
6630static ssize_t
6631raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
6632{
6633	struct r5conf *conf = mddev->private;
6634	unsigned long new;
6635
6636	if (!conf)
6637		return -ENODEV;
6638
6639	if (len >= PAGE_SIZE)
6640		return -EINVAL;
6641
6642	if (kstrtoul(page, 10, &new))
6643		return -EINVAL;
6644
6645	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6646		return -EINVAL;
6647
6648	if (new != PARITY_DISABLE_RMW &&
6649	    new != PARITY_ENABLE_RMW &&
6650	    new != PARITY_PREFER_RMW)
6651		return -EINVAL;
6652
6653	conf->rmw_level = new;
6654	return len;
6655}
6656
6657static struct md_sysfs_entry
6658raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6659			 raid5_show_rmw_level,
6660			 raid5_store_rmw_level);
6661
6662static ssize_t
6663raid5_show_stripe_size(struct mddev  *mddev, char *page)
6664{
6665	struct r5conf *conf;
6666	int ret = 0;
6667
6668	spin_lock(&mddev->lock);
6669	conf = mddev->private;
6670	if (conf)
6671		ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6672	spin_unlock(&mddev->lock);
6673	return ret;
6674}
6675
6676#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6677static ssize_t
6678raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
6679{
6680	struct r5conf *conf;
6681	unsigned long new;
6682	int err;
6683	int size;
6684
6685	if (len >= PAGE_SIZE)
6686		return -EINVAL;
6687	if (kstrtoul(page, 10, &new))
6688		return -EINVAL;
6689
6690	/*
6691	 * The value must not be bigger than PAGE_SIZE. It must be a
6692	 * multiple of DEFAULT_STRIPE_SIZE and the value must be a power
6693	 * of two.
6694	 */
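	/*
	 * For example (assuming the usual 4096-byte DEFAULT_STRIPE_SIZE),
	 * on a 64KiB-page kernel the accepted values would be 4096, 8192,
	 * 16384, 32768 and 65536.  This store method is only compiled in
	 * when PAGE_SIZE != DEFAULT_STRIPE_SIZE, since otherwise 4096
	 * would be the only legal value.
	 */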
6695	if (new % DEFAULT_STRIPE_SIZE != 0 ||
6696			new > PAGE_SIZE || new == 0 ||
6697			new != roundup_pow_of_two(new))
6698		return -EINVAL;
6699
6700	err = mddev_lock(mddev);
6701	if (err)
6702		return err;
6703
6704	conf = mddev->private;
6705	if (!conf) {
6706		err = -ENODEV;
6707		goto out_unlock;
6708	}
6709
6710	if (new == conf->stripe_size)
6711		goto out_unlock;
6712
6713	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6714			conf->stripe_size, new);
6715
6716	if (mddev->sync_thread ||
6717		test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6718		mddev->reshape_position != MaxSector ||
6719		mddev->sysfs_active) {
6720		err = -EBUSY;
6721		goto out_unlock;
6722	}
6723
6724	mddev_suspend(mddev);
6725	mutex_lock(&conf->cache_size_mutex);
6726	size = conf->max_nr_stripes;
6727
6728	shrink_stripes(conf);
6729
6730	conf->stripe_size = new;
6731	conf->stripe_shift = ilog2(new) - 9;
6732	conf->stripe_sectors = new >> 9;
6733	if (grow_stripes(conf, size)) {
6734		pr_warn("md/raid:%s: couldn't allocate buffers\n",
6735				mdname(mddev));
6736		err = -ENOMEM;
6737	}
6738	mutex_unlock(&conf->cache_size_mutex);
6739	mddev_resume(mddev);
6740
6741out_unlock:
6742	mddev_unlock(mddev);
6743	return err ?: len;
6744}
6745
6746static struct md_sysfs_entry
6747raid5_stripe_size = __ATTR(stripe_size, 0644,
6748			 raid5_show_stripe_size,
6749			 raid5_store_stripe_size);
6750#else
6751static struct md_sysfs_entry
6752raid5_stripe_size = __ATTR(stripe_size, 0444,
6753			 raid5_show_stripe_size,
6754			 NULL);
6755#endif
6756
6757static ssize_t
6758raid5_show_preread_threshold(struct mddev *mddev, char *page)
6759{
6760	struct r5conf *conf;
6761	int ret = 0;
6762	spin_lock(&mddev->lock);
6763	conf = mddev->private;
6764	if (conf)
6765		ret = sprintf(page, "%d\n", conf->bypass_threshold);
6766	spin_unlock(&mddev->lock);
6767	return ret;
6768}
6769
6770static ssize_t
6771raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6772{
6773	struct r5conf *conf;
6774	unsigned long new;
6775	int err;
6776
6777	if (len >= PAGE_SIZE)
6778		return -EINVAL;
6779	if (kstrtoul(page, 10, &new))
6780		return -EINVAL;
6781
6782	err = mddev_lock(mddev);
6783	if (err)
6784		return err;
6785	conf = mddev->private;
6786	if (!conf)
6787		err = -ENODEV;
6788	else if (new > conf->min_nr_stripes)
6789		err = -EINVAL;
6790	else
6791		conf->bypass_threshold = new;
6792	mddev_unlock(mddev);
6793	return err ?: len;
6794}
6795
6796static struct md_sysfs_entry
6797raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6798					S_IRUGO | S_IWUSR,
6799					raid5_show_preread_threshold,
6800					raid5_store_preread_threshold);
6801
6802static ssize_t
6803raid5_show_skip_copy(struct mddev *mddev, char *page)
6804{
6805	struct r5conf *conf;
6806	int ret = 0;
6807	spin_lock(&mddev->lock);
6808	conf = mddev->private;
6809	if (conf)
6810		ret = sprintf(page, "%d\n", conf->skip_copy);
6811	spin_unlock(&mddev->lock);
6812	return ret;
6813}
6814
6815static ssize_t
6816raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6817{
6818	struct r5conf *conf;
6819	unsigned long new;
6820	int err;
6821
6822	if (len >= PAGE_SIZE)
6823		return -EINVAL;
6824	if (kstrtoul(page, 10, &new))
6825		return -EINVAL;
6826	new = !!new;
6827
6828	err = mddev_lock(mddev);
6829	if (err)
6830		return err;
6831	conf = mddev->private;
6832	if (!conf)
6833		err = -ENODEV;
6834	else if (new != conf->skip_copy) {
6835		struct request_queue *q = mddev->queue;
6836
6837		mddev_suspend(mddev);
6838		conf->skip_copy = new;
6839		if (new)
6840			blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
6841		else
6842			blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
6843		mddev_resume(mddev);
6844	}
6845	mddev_unlock(mddev);
6846	return err ?: len;
6847}
6848
6849static struct md_sysfs_entry
6850raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6851					raid5_show_skip_copy,
6852					raid5_store_skip_copy);
6853
6854static ssize_t
6855stripe_cache_active_show(struct mddev *mddev, char *page)
6856{
6857	struct r5conf *conf = mddev->private;
6858	if (conf)
6859		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6860	else
6861		return 0;
6862}
6863
6864static struct md_sysfs_entry
6865raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6866
6867static ssize_t
6868raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6869{
6870	struct r5conf *conf;
6871	int ret = 0;
6872	spin_lock(&mddev->lock);
6873	conf = mddev->private;
6874	if (conf)
6875		ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6876	spin_unlock(&mddev->lock);
6877	return ret;
6878}
6879
6880static int alloc_thread_groups(struct r5conf *conf, int cnt,
6881			       int *group_cnt,
6882			       struct r5worker_group **worker_groups);
6883static ssize_t
6884raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6885{
6886	struct r5conf *conf;
6887	unsigned int new;
6888	int err;
6889	struct r5worker_group *new_groups, *old_groups;
6890	int group_cnt;
6891
6892	if (len >= PAGE_SIZE)
6893		return -EINVAL;
6894	if (kstrtouint(page, 10, &new))
6895		return -EINVAL;
6896	/* 8192 should be big enough */
6897	if (new > 8192)
6898		return -EINVAL;
6899
6900	err = mddev_lock(mddev);
6901	if (err)
6902		return err;
6903	conf = mddev->private;
6904	if (!conf)
6905		err = -ENODEV;
6906	else if (new != conf->worker_cnt_per_group) {
6907		mddev_suspend(mddev);
6908
6909		old_groups = conf->worker_groups;
6910		if (old_groups)
6911			flush_workqueue(raid5_wq);
6912
6913		err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
6914		if (!err) {
6915			spin_lock_irq(&conf->device_lock);
6916			conf->group_cnt = group_cnt;
6917			conf->worker_cnt_per_group = new;
6918			conf->worker_groups = new_groups;
6919			spin_unlock_irq(&conf->device_lock);
6920
6921			if (old_groups)
6922				kfree(old_groups[0].workers);
6923			kfree(old_groups);
6924		}
6925		mddev_resume(mddev);
6926	}
6927	mddev_unlock(mddev);
6928
6929	return err ?: len;
6930}
6931
6932static struct md_sysfs_entry
6933raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6934				raid5_show_group_thread_cnt,
6935				raid5_store_group_thread_cnt);
6936
6937static struct attribute *raid5_attrs[] =  {
6938	&raid5_stripecache_size.attr,
6939	&raid5_stripecache_active.attr,
6940	&raid5_preread_bypass_threshold.attr,
6941	&raid5_group_thread_cnt.attr,
6942	&raid5_skip_copy.attr,
6943	&raid5_rmw_level.attr,
6944	&raid5_stripe_size.attr,
6945	&r5c_journal_mode.attr,
6946	&ppl_write_hint.attr,
6947	NULL,
6948};
6949static struct attribute_group raid5_attrs_group = {
6950	.name = NULL,
6951	.attrs = raid5_attrs,
6952};
6953
6954static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
6955			       struct r5worker_group **worker_groups)
6956{
6957	int i, j, k;
6958	ssize_t size;
6959	struct r5worker *workers;
6960
6961	if (cnt == 0) {
6962		*group_cnt = 0;
6963		*worker_groups = NULL;
6964		return 0;
6965	}
6966	*group_cnt = num_possible_nodes();
6967	size = sizeof(struct r5worker) * cnt;
6968	workers = kcalloc(size, *group_cnt, GFP_NOIO);
6969	*worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6970				 GFP_NOIO);
6971	if (!*worker_groups || !workers) {
6972		kfree(workers);
6973		kfree(*worker_groups);
6974		return -ENOMEM;
6975	}
6976
6977	for (i = 0; i < *group_cnt; i++) {
6978		struct r5worker_group *group;
6979
6980		group = &(*worker_groups)[i];
6981		INIT_LIST_HEAD(&group->handle_list);
6982		INIT_LIST_HEAD(&group->loprio_list);
6983		group->conf = conf;
6984		group->workers = workers + i * cnt;
6985
6986		for (j = 0; j < cnt; j++) {
6987			struct r5worker *worker = group->workers + j;
6988			worker->group = group;
6989			INIT_WORK(&worker->work, raid5_do_work);
6990
6991			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6992				INIT_LIST_HEAD(worker->temp_inactive_list + k);
6993		}
6994	}
6995
6996	return 0;
6997}
6998
6999static void free_thread_groups(struct r5conf *conf)
7000{
7001	if (conf->worker_groups)
7002		kfree(conf->worker_groups[0].workers);
7003	kfree(conf->worker_groups);
7004	conf->worker_groups = NULL;
7005}
7006
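/*
 * Usable array size: the per-device sector count is rounded down to a
 * whole number of chunks (for both current and previous geometry) and
 * multiplied by the number of data disks.  Purely illustrative example:
 * a 6-device RAID6 (max_degraded == 2) with 1024-sector chunks and
 * 10000 sectors per device yields 9216 * 4 = 36864 sectors.
 */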
7007static sector_t
7008raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7009{
7010	struct r5conf *conf = mddev->private;
7011
7012	if (!sectors)
7013		sectors = mddev->dev_sectors;
7014	if (!raid_disks)
7015		/* size is defined by the smallest of previous and new size */
7016		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7017
7018	sectors &= ~((sector_t)conf->chunk_sectors - 1);
7019	sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7020	return sectors * (raid_disks - conf->max_degraded);
7021}
7022
7023static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7024{
7025	safe_put_page(percpu->spare_page);
7026	percpu->spare_page = NULL;
7027	kvfree(percpu->scribble);
7028	percpu->scribble = NULL;
7029}
7030
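/*
 * Per-CPU scratch space: RAID6 additionally needs a spare page for
 * syndrome recovery, and every level needs a scribble buffer sized for
 * the larger of the old and new geometry so it stays valid across a
 * reshape.
 */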
7031static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7032{
7033	if (conf->level == 6 && !percpu->spare_page) {
7034		percpu->spare_page = alloc_page(GFP_KERNEL);
7035		if (!percpu->spare_page)
7036			return -ENOMEM;
7037	}
7038
7039	if (scribble_alloc(percpu,
7040			   max(conf->raid_disks,
7041			       conf->previous_raid_disks),
7042			   max(conf->chunk_sectors,
7043			       conf->prev_chunk_sectors)
7044			   / RAID5_STRIPE_SECTORS(conf))) {
7045		free_scratch_buffer(conf, percpu);
7046		return -ENOMEM;
7047	}
7048
7049	return 0;
7050}
7051
7052static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7053{
7054	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7055
7056	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7057	return 0;
7058}
7059
7060static void raid5_free_percpu(struct r5conf *conf)
7061{
7062	if (!conf->percpu)
7063		return;
7064
7065	cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7066	free_percpu(conf->percpu);
7067}
7068
7069static void free_conf(struct r5conf *conf)
7070{
7071	int i;
7072
7073	log_exit(conf);
7074
7075	unregister_shrinker(&conf->shrinker);
7076	free_thread_groups(conf);
7077	shrink_stripes(conf);
7078	raid5_free_percpu(conf);
7079	for (i = 0; i < conf->pool_size; i++)
7080		if (conf->disks[i].extra_page)
7081			put_page(conf->disks[i].extra_page);
7082	kfree(conf->disks);
7083	bioset_exit(&conf->bio_split);
7084	kfree(conf->stripe_hashtbl);
7085	kfree(conf->pending_data);
7086	kfree(conf);
7087}
7088
7089static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7090{
7091	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7092	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7093
7094	if (alloc_scratch_buffer(conf, percpu)) {
7095		pr_warn("%s: failed memory allocation for cpu%u\n",
7096			__func__, cpu);
7097		return -ENOMEM;
7098	}
7099	return 0;
7100}
7101
7102static int raid5_alloc_percpu(struct r5conf *conf)
7103{
7104	int err = 0;
7105
7106	conf->percpu = alloc_percpu(struct raid5_percpu);
7107	if (!conf->percpu)
7108		return -ENOMEM;
7109
7110	err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7111	if (!err) {
7112		conf->scribble_disks = max(conf->raid_disks,
7113			conf->previous_raid_disks);
7114		conf->scribble_sectors = max(conf->chunk_sectors,
7115			conf->prev_chunk_sectors);
7116	}
7117	return err;
7118}
7119
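/*
 * Shrinker callback: under memory pressure, drop cached stripes one at
 * a time until either nr_to_scan stripes have been freed or the cache
 * has shrunk to min_nr_stripes.  SHRINK_STOP is returned when the
 * cache mutex is contended or a stripe cannot be dropped.
 */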
7120static unsigned long raid5_cache_scan(struct shrinker *shrink,
7121				      struct shrink_control *sc)
7122{
7123	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7124	unsigned long ret = SHRINK_STOP;
7125
7126	if (mutex_trylock(&conf->cache_size_mutex)) {
7127		ret = 0;
7128		while (ret < sc->nr_to_scan &&
7129		       conf->max_nr_stripes > conf->min_nr_stripes) {
7130			if (drop_one_stripe(conf) == 0) {
7131				ret = SHRINK_STOP;
7132				break;
7133			}
7134			ret++;
7135		}
7136		mutex_unlock(&conf->cache_size_mutex);
7137	}
7138	return ret;
7139}
7140
7141static unsigned long raid5_cache_count(struct shrinker *shrink,
7142				       struct shrink_control *sc)
7143{
7144	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7145
7146	if (conf->max_nr_stripes < conf->min_nr_stripes)
7147		/* unlikely, but not impossible */
7148		return 0;
7149	return conf->max_nr_stripes - conf->min_nr_stripes;
7150}
7151
7152static struct r5conf *setup_conf(struct mddev *mddev)
7153{
7154	struct r5conf *conf;
7155	int raid_disk, memory, max_disks;
7156	struct md_rdev *rdev;
7157	struct disk_info *disk;
7158	char pers_name[6];
7159	int i;
7160	int group_cnt;
7161	struct r5worker_group *new_group;
7162	int ret;
7163
7164	if (mddev->new_level != 5
7165	    && mddev->new_level != 4
7166	    && mddev->new_level != 6) {
7167		pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7168			mdname(mddev), mddev->new_level);
7169		return ERR_PTR(-EIO);
7170	}
7171	if ((mddev->new_level == 5
7172	     && !algorithm_valid_raid5(mddev->new_layout)) ||
7173	    (mddev->new_level == 6
7174	     && !algorithm_valid_raid6(mddev->new_layout))) {
7175		pr_warn("md/raid:%s: layout %d not supported\n",
7176			mdname(mddev), mddev->new_layout);
7177		return ERR_PTR(-EIO);
7178	}
7179	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7180		pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7181			mdname(mddev), mddev->raid_disks);
7182		return ERR_PTR(-EINVAL);
7183	}
7184
7185	if (!mddev->new_chunk_sectors ||
7186	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7187	    !is_power_of_2(mddev->new_chunk_sectors)) {
7188		pr_warn("md/raid:%s: invalid chunk size %d\n",
7189			mdname(mddev), mddev->new_chunk_sectors << 9);
7190		return ERR_PTR(-EINVAL);
7191	}
7192
7193	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7194	if (conf == NULL)
7195		goto abort;
7196
7197#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7198	conf->stripe_size = DEFAULT_STRIPE_SIZE;
7199	conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7200	conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7201#endif
7202	INIT_LIST_HEAD(&conf->free_list);
7203	INIT_LIST_HEAD(&conf->pending_list);
7204	conf->pending_data = kcalloc(PENDING_IO_MAX,
7205				     sizeof(struct r5pending_data),
7206				     GFP_KERNEL);
7207	if (!conf->pending_data)
7208		goto abort;
7209	for (i = 0; i < PENDING_IO_MAX; i++)
7210		list_add(&conf->pending_data[i].sibling, &conf->free_list);
7211	/* Don't enable multi-threading by default */
7212	if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7213		conf->group_cnt = group_cnt;
7214		conf->worker_cnt_per_group = 0;
7215		conf->worker_groups = new_group;
7216	} else
7217		goto abort;
7218	spin_lock_init(&conf->device_lock);
7219	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7220	mutex_init(&conf->cache_size_mutex);
7221	init_waitqueue_head(&conf->wait_for_quiescent);
7222	init_waitqueue_head(&conf->wait_for_stripe);
7223	init_waitqueue_head(&conf->wait_for_overlap);
7224	INIT_LIST_HEAD(&conf->handle_list);
7225	INIT_LIST_HEAD(&conf->loprio_list);
7226	INIT_LIST_HEAD(&conf->hold_list);
7227	INIT_LIST_HEAD(&conf->delayed_list);
7228	INIT_LIST_HEAD(&conf->bitmap_list);
7229	init_llist_head(&conf->released_stripes);
7230	atomic_set(&conf->active_stripes, 0);
7231	atomic_set(&conf->preread_active_stripes, 0);
7232	atomic_set(&conf->active_aligned_reads, 0);
7233	spin_lock_init(&conf->pending_bios_lock);
7234	conf->batch_bio_dispatch = true;
7235	rdev_for_each(rdev, mddev) {
7236		if (test_bit(Journal, &rdev->flags))
7237			continue;
7238		if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
7239			conf->batch_bio_dispatch = false;
7240			break;
7241		}
7242	}
7243
7244	conf->bypass_threshold = BYPASS_THRESHOLD;
7245	conf->recovery_disabled = mddev->recovery_disabled - 1;
7246
7247	conf->raid_disks = mddev->raid_disks;
7248	if (mddev->reshape_position == MaxSector)
7249		conf->previous_raid_disks = mddev->raid_disks;
7250	else
7251		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7252	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7253
7254	conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7255			      GFP_KERNEL);
7256
7257	if (!conf->disks)
7258		goto abort;
7259
7260	for (i = 0; i < max_disks; i++) {
7261		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7262		if (!conf->disks[i].extra_page)
7263			goto abort;
7264	}
7265
7266	ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7267	if (ret)
7268		goto abort;
7269	conf->mddev = mddev;
7270
7271	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
7272		goto abort;
7273
7274	/* We init hash_locks[0] separately so that it can be used
7275	 * as the reference lock in the spin_lock_nest_lock() call
7276	 * in lock_all_device_hash_locks_irq in order to convince
7277	 * lockdep that we know what we are doing.
7278	 */
7279	spin_lock_init(conf->hash_locks);
7280	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7281		spin_lock_init(conf->hash_locks + i);
7282
7283	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7284		INIT_LIST_HEAD(conf->inactive_list + i);
7285
7286	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7287		INIT_LIST_HEAD(conf->temp_inactive_list + i);
7288
7289	atomic_set(&conf->r5c_cached_full_stripes, 0);
7290	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7291	atomic_set(&conf->r5c_cached_partial_stripes, 0);
7292	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7293	atomic_set(&conf->r5c_flushing_full_stripes, 0);
7294	atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7295
7296	conf->level = mddev->new_level;
7297	conf->chunk_sectors = mddev->new_chunk_sectors;
7298	if (raid5_alloc_percpu(conf) != 0)
7299		goto abort;
7300
7301	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7302
7303	rdev_for_each(rdev, mddev) {
7304		raid_disk = rdev->raid_disk;
7305		if (raid_disk >= max_disks
7306		    || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7307			continue;
7308		disk = conf->disks + raid_disk;
7309
7310		if (test_bit(Replacement, &rdev->flags)) {
7311			if (disk->replacement)
7312				goto abort;
7313			disk->replacement = rdev;
7314		} else {
7315			if (disk->rdev)
7316				goto abort;
7317			disk->rdev = rdev;
7318		}
7319
7320		if (test_bit(In_sync, &rdev->flags)) {
7321			char b[BDEVNAME_SIZE];
7322			pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7323				mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7324		} else if (rdev->saved_raid_disk != raid_disk)
7325			/* Cannot rely on bitmap to complete recovery */
7326			conf->fullsync = 1;
7327	}
7328
7329	conf->level = mddev->new_level;
7330	if (conf->level == 6) {
7331		conf->max_degraded = 2;
7332		if (raid6_call.xor_syndrome)
7333			conf->rmw_level = PARITY_ENABLE_RMW;
7334		else
7335			conf->rmw_level = PARITY_DISABLE_RMW;
7336	} else {
7337		conf->max_degraded = 1;
7338		conf->rmw_level = PARITY_ENABLE_RMW;
7339	}
7340	conf->algorithm = mddev->new_layout;
7341	conf->reshape_progress = mddev->reshape_position;
7342	if (conf->reshape_progress != MaxSector) {
7343		conf->prev_chunk_sectors = mddev->chunk_sectors;
7344		conf->prev_algo = mddev->layout;
7345	} else {
7346		conf->prev_chunk_sectors = conf->chunk_sectors;
7347		conf->prev_algo = conf->algorithm;
7348	}
7349
7350	conf->min_nr_stripes = NR_STRIPES;
7351	if (mddev->reshape_position != MaxSector) {
7352		int stripes = max_t(int,
7353			((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7354			((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7355		conf->min_nr_stripes = max(NR_STRIPES, stripes);
7356		if (conf->min_nr_stripes != NR_STRIPES)
7357			pr_info("md/raid:%s: force stripe size %d for reshape\n",
7358				mdname(mddev), conf->min_nr_stripes);
7359	}
7360	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7361		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
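	/*
	 * 'memory' is only a rough per-cache estimate (in kB) for the
	 * messages below: each stripe_head is assumed to carry, per
	 * device, a struct bio and one page of data, so the total is
	 * dominated by roughly min_nr_stripes * max_disks pages.
	 */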
7362	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7363	if (grow_stripes(conf, conf->min_nr_stripes)) {
7364		pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7365			mdname(mddev), memory);
7366		goto abort;
7367	} else
7368		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7369	/*
7370	 * Losing a stripe head costs more than the time to refill it:
7371	 * it reduces the queue depth and so can hurt throughput.
7372	 * So set it rather large, scaled by number of devices.
7373	 */
7374	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7375	conf->shrinker.scan_objects = raid5_cache_scan;
7376	conf->shrinker.count_objects = raid5_cache_count;
7377	conf->shrinker.batch = 128;
7378	conf->shrinker.flags = 0;
7379	if (register_shrinker(&conf->shrinker)) {
7380		pr_warn("md/raid:%s: couldn't register shrinker.\n",
7381			mdname(mddev));
7382		goto abort;
7383	}
7384
7385	sprintf(pers_name, "raid%d", mddev->new_level);
7386	conf->thread = md_register_thread(raid5d, mddev, pers_name);
7387	if (!conf->thread) {
7388		pr_warn("md/raid:%s: couldn't allocate thread.\n",
7389			mdname(mddev));
7390		goto abort;
7391	}
7392
7393	return conf;
7394
7395 abort:
7396	if (conf) {
7397		free_conf(conf);
7398		return ERR_PTR(-EIO);
7399	} else
7400		return ERR_PTR(-ENOMEM);
7401}
7402
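/*
 * Return 1 if, under layout 'algo', the device at 'raid_disk' holds
 * only parity blocks: ALGORITHM_PARITY_0 keeps parity on the first
 * max_degraded devices, ALGORITHM_PARITY_N on the last ones, and the
 * *_6 layouts keep Q on the last device.  raid5_run uses this to tell
 * whether an out-of-sync device actually puts data at risk.
 */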
7403static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7404{
7405	switch (algo) {
7406	case ALGORITHM_PARITY_0:
7407		if (raid_disk < max_degraded)
7408			return 1;
7409		break;
7410	case ALGORITHM_PARITY_N:
7411		if (raid_disk >= raid_disks - max_degraded)
7412			return 1;
7413		break;
7414	case ALGORITHM_PARITY_0_6:
7415		if (raid_disk == 0 ||
7416		    raid_disk == raid_disks - 1)
7417			return 1;
7418		break;
7419	case ALGORITHM_LEFT_ASYMMETRIC_6:
7420	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7421	case ALGORITHM_LEFT_SYMMETRIC_6:
7422	case ALGORITHM_RIGHT_SYMMETRIC_6:
7423		if (raid_disk == raid_disks - 1)
7424			return 1;
7425	}
7426	return 0;
7427}
7428
7429static void raid5_set_io_opt(struct r5conf *conf)
7430{
7431	blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7432			 (conf->raid_disks - conf->max_degraded));
7433}
7434
7435static int raid5_run(struct mddev *mddev)
7436{
7437	struct r5conf *conf;
7438	int working_disks = 0;
7439	int dirty_parity_disks = 0;
7440	struct md_rdev *rdev;
7441	struct md_rdev *journal_dev = NULL;
7442	sector_t reshape_offset = 0;
7443	int i;
7444	long long min_offset_diff = 0;
7445	int first = 1;
7446
7447	if (mddev_init_writes_pending(mddev) < 0)
7448		return -ENOMEM;
7449
7450	if (mddev->recovery_cp != MaxSector)
7451		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7452			  mdname(mddev));
7453
7454	rdev_for_each(rdev, mddev) {
7455		long long diff;
7456
7457		if (test_bit(Journal, &rdev->flags)) {
7458			journal_dev = rdev;
7459			continue;
7460		}
7461		if (rdev->raid_disk < 0)
7462			continue;
7463		diff = (rdev->new_data_offset - rdev->data_offset);
7464		if (first) {
7465			min_offset_diff = diff;
7466			first = 0;
7467		} else if (mddev->reshape_backwards &&
7468			 diff < min_offset_diff)
7469			min_offset_diff = diff;
7470		else if (!mddev->reshape_backwards &&
7471			 diff > min_offset_diff)
7472			min_offset_diff = diff;
7473	}
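	/*
	 * min_offset_diff now holds the most restrictive difference
	 * between new_data_offset and data_offset across all member
	 * devices; it is used below to decide whether an "in-place"
	 * reshape really writes to a different on-disk area.
	 */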
7474
7475	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7476	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7477		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7478			  mdname(mddev));
7479		return -EINVAL;
7480	}
7481
7482	if (mddev->reshape_position != MaxSector) {
7483		/* Check that we can continue the reshape.
7484		 * Difficulties arise if the stripe we would write to
7485		 * next is at or after the stripe we would read from next.
7486		 * For a reshape that changes the number of devices, this
7487		 * is only possible for a very short time, and mdadm makes
7488		 * sure that time appears to have passed before assembling
7489		 * the array.  So we fail if that time hasn't passed.
7490		 * For a reshape that keeps the number of devices the same,
7491		 * mdadm must be monitoring the reshape and keeping the
7492		 * critical areas read-only and backed up.  It will start
7493		 * the array in read-only mode, so we check for that.
7494		 */
7495		sector_t here_new, here_old;
7496		int old_disks;
7497		int max_degraded = (mddev->level == 6 ? 2 : 1);
7498		int chunk_sectors;
7499		int new_data_disks;
7500
7501		if (journal_dev) {
7502			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7503				mdname(mddev));
7504			return -EINVAL;
7505		}
7506
7507		if (mddev->new_level != mddev->level) {
7508			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7509				mdname(mddev));
7510			return -EINVAL;
7511		}
7512		old_disks = mddev->raid_disks - mddev->delta_disks;
7513		/* reshape_position must be on a new-stripe boundary, and one
7514		 * further up in new geometry must map after here in old
7515		 * geometry.
7516		 * If the chunk sizes are different, then as we perform reshape
7517		 * in units of the largest of the two, reshape_position needs to
7518		 * be a multiple of the largest chunk size times new data disks.
7519		 */
7520		here_new = mddev->reshape_position;
7521		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7522		new_data_disks = mddev->raid_disks - max_degraded;
7523		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7524			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7525				mdname(mddev));
7526			return -EINVAL;
7527		}
7528		reshape_offset = here_new * chunk_sectors;
7529		/* here_new is the stripe we will write to */
7530		here_old = mddev->reshape_position;
7531		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7532		/* here_old is the first stripe that we might need to read
7533		 * from */
7534		if (mddev->delta_disks == 0) {
7535			/* We cannot be sure it is safe to start an in-place
7536			 * reshape.  It is only safe if user-space is monitoring
7537			 * and taking constant backups.
7538			 * mdadm always starts a situation like this in
7539			 * readonly mode so it can take control before
7540			 * allowing any writes.  So just check for that.
7541			 */
7542			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7543			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
7544				/* not really in-place - so OK */;
7545			else if (mddev->ro == 0) {
7546				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7547					mdname(mddev));
7548				return -EINVAL;
7549			}
7550		} else if (mddev->reshape_backwards
7551		    ? (here_new * chunk_sectors + min_offset_diff <=
7552		       here_old * chunk_sectors)
7553		    : (here_new * chunk_sectors >=
7554		       here_old * chunk_sectors + (-min_offset_diff))) {
7555			/* Reading from the same stripe as writing to - bad */
7556			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7557				mdname(mddev));
7558			return -EINVAL;
7559		}
7560		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7561		/* OK, we should be able to continue; */
7562	} else {
7563		BUG_ON(mddev->level != mddev->new_level);
7564		BUG_ON(mddev->layout != mddev->new_layout);
7565		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7566		BUG_ON(mddev->delta_disks != 0);
7567	}
7568
7569	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7570	    test_bit(MD_HAS_PPL, &mddev->flags)) {
7571		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7572			mdname(mddev));
7573		clear_bit(MD_HAS_PPL, &mddev->flags);
7574		clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7575	}
7576
7577	if (mddev->private == NULL)
7578		conf = setup_conf(mddev);
7579	else
7580		conf = mddev->private;
7581
7582	if (IS_ERR(conf))
7583		return PTR_ERR(conf);
7584
7585	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7586		if (!journal_dev) {
7587			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7588				mdname(mddev));
7589			mddev->ro = 1;
7590			set_disk_ro(mddev->gendisk, 1);
7591		} else if (mddev->recovery_cp == MaxSector)
7592			set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7593	}
7594
7595	conf->min_offset_diff = min_offset_diff;
7596	mddev->thread = conf->thread;
7597	conf->thread = NULL;
7598	mddev->private = conf;
7599
7600	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7601	     i++) {
7602		rdev = conf->disks[i].rdev;
7603		if (!rdev && conf->disks[i].replacement) {
7604			/* The replacement is all we have so far */
7605			rdev = conf->disks[i].replacement;
7606			conf->disks[i].replacement = NULL;
7607			clear_bit(Replacement, &rdev->flags);
7608			conf->disks[i].rdev = rdev;
7609		}
7610		if (!rdev)
7611			continue;
7612		if (conf->disks[i].replacement &&
7613		    conf->reshape_progress != MaxSector) {
7614			/* replacements and reshape simply do not mix. */
7615			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7616			goto abort;
7617		}
7618		if (test_bit(In_sync, &rdev->flags)) {
7619			working_disks++;
7620			continue;
7621		}
7622		/* This disk is not fully in-sync.  However, if it
7623		 * just stored parity (beyond the recovery_offset),
7624		 * then we don't need to be concerned about the
7625		 * array being dirty.
7626		 * When reshape goes 'backwards', we never have
7627		 * partially completed devices, so we only need
7628		 * to worry about reshape going forwards.
7629		 */
7630		/* Hack because v0.91 doesn't store recovery_offset properly. */
7631		if (mddev->major_version == 0 &&
7632		    mddev->minor_version > 90)
7633			rdev->recovery_offset = reshape_offset;
7634
7635		if (rdev->recovery_offset < reshape_offset) {
7636			/* We need to check old and new layout */
7637			if (!only_parity(rdev->raid_disk,
7638					 conf->algorithm,
7639					 conf->raid_disks,
7640					 conf->max_degraded))
7641				continue;
7642		}
7643		if (!only_parity(rdev->raid_disk,
7644				 conf->prev_algo,
7645				 conf->previous_raid_disks,
7646				 conf->max_degraded))
7647			continue;
7648		dirty_parity_disks++;
7649	}
7650
7651	/*
7652	 * 0 for a fully functional array, 1 or 2 for a degraded array.
7653	 */
7654	mddev->degraded = raid5_calc_degraded(conf);
7655
7656	if (has_failed(conf)) {
7657		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7658			mdname(mddev), mddev->degraded, conf->raid_disks);
7659		goto abort;
7660	}
7661
7662	/* device size must be a multiple of chunk size */
7663	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7664	mddev->resync_max_sectors = mddev->dev_sectors;
7665
7666	if (mddev->degraded > dirty_parity_disks &&
7667	    mddev->recovery_cp != MaxSector) {
7668		if (test_bit(MD_HAS_PPL, &mddev->flags))
7669			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7670				mdname(mddev));
7671		else if (mddev->ok_start_degraded)
7672			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7673				mdname(mddev));
7674		else {
7675			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7676				mdname(mddev));
7677			goto abort;
7678		}
7679	}
7680
7681	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7682		mdname(mddev), conf->level,
7683		mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7684		mddev->new_layout);
7685
7686	print_raid5_conf(conf);
7687
7688	if (conf->reshape_progress != MaxSector) {
7689		conf->reshape_safe = conf->reshape_progress;
7690		atomic_set(&conf->reshape_stripes, 0);
7691		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7692		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7693		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7694		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7695		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7696							"reshape");
7697		if (!mddev->sync_thread)
7698			goto abort;
7699	}
7700
7701	/* Ok, everything is just fine now */
7702	if (mddev->to_remove == &raid5_attrs_group)
7703		mddev->to_remove = NULL;
7704	else if (mddev->kobj.sd &&
7705	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7706		pr_warn("raid5: failed to create sysfs attributes for %s\n",
7707			mdname(mddev));
7708	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7709
7710	if (mddev->queue) {
7711		int chunk_size;
7712		/* read-ahead size must cover two whole stripes, which
7713		 * is 2 * (data disks) * chunksize, where data disks is the
7714		 * number of raid devices minus max_degraded
7715		 */
7716		int data_disks = conf->previous_raid_disks - conf->max_degraded;
7717		int stripe = data_disks *
7718			((mddev->chunk_sectors << 9) / PAGE_SIZE);
7719
7720		chunk_size = mddev->chunk_sectors << 9;
7721		blk_queue_io_min(mddev->queue, chunk_size);
7722		raid5_set_io_opt(conf);
7723		mddev->queue->limits.raid_partial_stripes_expensive = 1;
7724		/*
7725		 * We can only discard a whole stripe. It doesn't make sense to
7726		 * discard data disk but write parity disk
7727		 */
7728		stripe = stripe * PAGE_SIZE;
7729		/* Round up to power of 2, as discard handling
7730		 * currently assumes that */
7731		while ((stripe-1) & stripe)
7732			stripe = (stripe | (stripe-1)) + 1;
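		/*
		 * 'stripe' is now the full data-stripe width in bytes,
		 * rounded up to a power of two: e.g. three 4KiB data pages
		 * (12KiB) become 16KiB, which is then used as the discard
		 * alignment and granularity below.
		 */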
7733		mddev->queue->limits.discard_alignment = stripe;
7734		mddev->queue->limits.discard_granularity = stripe;
7735
7736		blk_queue_max_write_same_sectors(mddev->queue, 0);
7737		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7738
7739		rdev_for_each(rdev, mddev) {
7740			disk_stack_limits(mddev->gendisk, rdev->bdev,
7741					  rdev->data_offset << 9);
7742			disk_stack_limits(mddev->gendisk, rdev->bdev,
7743					  rdev->new_data_offset << 9);
7744		}
7745
7746		/*
7747		 * zeroing is required, otherwise data
7748		 * could be lost. Consider a scenario: discard a stripe
7749		 * (the stripe could be inconsistent if
7750		 * discard_zeroes_data is 0); write one disk of the
7751		 * stripe (the stripe could be inconsistent again
7752		 * depending on which disks are used to calculate
7753		 * parity); the disk is broken; The stripe data of this
7754		 * disk is lost.
7755		 *
7756		 * We only allow DISCARD if the sysadmin has confirmed that
7757		 * only safe devices are in use by setting a module parameter.
7758		 * A better idea might be to turn DISCARD into WRITE_ZEROES
7759		 * requests, as that is required to be safe.
7760		 */
7761		if (devices_handle_discard_safely &&
7762		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7763		    mddev->queue->limits.discard_granularity >= stripe)
7764			blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7765						mddev->queue);
7766		else
7767			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7768						mddev->queue);
7769
7770		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7771	}
7772
7773	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7774		goto abort;
7775
7776	return 0;
7777abort:
7778	md_unregister_thread(&mddev->thread);
7779	print_raid5_conf(conf);
7780	free_conf(conf);
7781	mddev->private = NULL;
7782	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7783	return -EIO;
7784}
7785
7786static void raid5_free(struct mddev *mddev, void *priv)
7787{
7788	struct r5conf *conf = priv;
7789
7790	free_conf(conf);
7791	mddev->to_remove = &raid5_attrs_group;
7792}
7793
7794static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7795{
7796	struct r5conf *conf = mddev->private;
7797	int i;
7798
7799	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7800		conf->chunk_sectors / 2, mddev->layout);
7801	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7802	rcu_read_lock();
7803	for (i = 0; i < conf->raid_disks; i++) {
7804		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7805		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7806	}
7807	rcu_read_unlock();
7808	seq_printf (seq, "]");
7809}
7810
7811static void print_raid5_conf (struct r5conf *conf)
7812{
7813	int i;
7814	struct disk_info *tmp;
7815
7816	pr_debug("RAID conf printout:\n");
7817	if (!conf) {
7818		pr_debug("(conf==NULL)\n");
7819		return;
7820	}
7821	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7822	       conf->raid_disks,
7823	       conf->raid_disks - conf->mddev->degraded);
7824
7825	for (i = 0; i < conf->raid_disks; i++) {
7826		char b[BDEVNAME_SIZE];
7827		tmp = conf->disks + i;
7828		if (tmp->rdev)
7829			pr_debug(" disk %d, o:%d, dev:%s\n",
7830			       i, !test_bit(Faulty, &tmp->rdev->flags),
7831			       bdevname(tmp->rdev->bdev, b));
7832	}
7833}
7834
7835static int raid5_spare_active(struct mddev *mddev)
7836{
7837	int i;
7838	struct r5conf *conf = mddev->private;
7839	struct disk_info *tmp;
7840	int count = 0;
7841	unsigned long flags;
7842
7843	for (i = 0; i < conf->raid_disks; i++) {
7844		tmp = conf->disks + i;
7845		if (tmp->replacement
7846		    && tmp->replacement->recovery_offset == MaxSector
7847		    && !test_bit(Faulty, &tmp->replacement->flags)
7848		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7849			/* Replacement has just become active. */
7850			if (!tmp->rdev
7851			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7852				count++;
7853			if (tmp->rdev) {
7854				/* Replaced device not technically faulty,
7855				 * but we need to be sure it gets removed
7856				 * and never re-added.
7857				 */
7858				set_bit(Faulty, &tmp->rdev->flags);
7859				sysfs_notify_dirent_safe(
7860					tmp->rdev->sysfs_state);
7861			}
7862			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7863		} else if (tmp->rdev
7864		    && tmp->rdev->recovery_offset == MaxSector
7865		    && !test_bit(Faulty, &tmp->rdev->flags)
7866		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7867			count++;
7868			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7869		}
7870	}
7871	spin_lock_irqsave(&conf->device_lock, flags);
7872	mddev->degraded = raid5_calc_degraded(conf);
7873	spin_unlock_irqrestore(&conf->device_lock, flags);
7874	print_raid5_conf(conf);
7875	return count;
7876}
7877
7878static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7879{
7880	struct r5conf *conf = mddev->private;
7881	int err = 0;
7882	int number = rdev->raid_disk;
7883	struct md_rdev **rdevp;
7884	struct disk_info *p = conf->disks + number;
7885
7886	print_raid5_conf(conf);
7887	if (test_bit(Journal, &rdev->flags) && conf->log) {
7888		/*
7889		 * we can't wait for pending writes here, as this is called in
7890		 * raid5d; waiting would deadlock.
7891		 * neilb: there is no locking about new writes here,
7892		 * so this cannot be safe.
7893		 */
7894		if (atomic_read(&conf->active_stripes) ||
7895		    atomic_read(&conf->r5c_cached_full_stripes) ||
7896		    atomic_read(&conf->r5c_cached_partial_stripes)) {
7897			return -EBUSY;
7898		}
7899		log_exit(conf);
7900		return 0;
7901	}
7902	if (rdev == p->rdev)
7903		rdevp = &p->rdev;
7904	else if (rdev == p->replacement)
7905		rdevp = &p->replacement;
7906	else
7907		return 0;
7908
7909	if (number >= conf->raid_disks &&
7910	    conf->reshape_progress == MaxSector)
7911		clear_bit(In_sync, &rdev->flags);
7912
7913	if (test_bit(In_sync, &rdev->flags) ||
7914	    atomic_read(&rdev->nr_pending)) {
7915		err = -EBUSY;
7916		goto abort;
7917	}
7918	/* Only remove non-faulty devices if recovery
7919	 * isn't possible.
7920	 */
7921	if (!test_bit(Faulty, &rdev->flags) &&
7922	    mddev->recovery_disabled != conf->recovery_disabled &&
7923	    !has_failed(conf) &&
7924	    (!p->replacement || p->replacement == rdev) &&
7925	    number < conf->raid_disks) {
7926		err = -EBUSY;
7927		goto abort;
7928	}
7929	*rdevp = NULL;
7930	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7931		synchronize_rcu();
7932		if (atomic_read(&rdev->nr_pending)) {
7933			/* lost the race, try later */
7934			err = -EBUSY;
7935			*rdevp = rdev;
7936		}
7937	}
7938	if (!err) {
7939		err = log_modify(conf, rdev, false);
7940		if (err)
7941			goto abort;
7942	}
7943	if (p->replacement) {
7944		/* We must have just cleared 'rdev' */
7945		p->rdev = p->replacement;
7946		clear_bit(Replacement, &p->replacement->flags);
7947		smp_mb(); /* Make sure other CPUs may see both as identical
7948			   * but will never see neither - if they are careful
7949			   */
7950		p->replacement = NULL;
7951
7952		if (!err)
7953			err = log_modify(conf, p->rdev, true);
7954	}
7955
7956	clear_bit(WantReplacement, &rdev->flags);
7957abort:
7958
7959	print_raid5_conf(conf);
7960	return err;
7961}
7962
7963static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7964{
7965	struct r5conf *conf = mddev->private;
7966	int ret, err = -EEXIST;
7967	int disk;
7968	struct disk_info *p;
7969	int first = 0;
7970	int last = conf->raid_disks - 1;
7971
7972	if (test_bit(Journal, &rdev->flags)) {
7973		if (conf->log)
7974			return -EBUSY;
7975
7976		rdev->raid_disk = 0;
7977		/*
7978		 * The array is in readonly mode if the journal is missing, so no
7979		 * write requests are running. We should be safe.
7980		 */
7981		ret = log_init(conf, rdev, false);
7982		if (ret)
7983			return ret;
7984
7985		ret = r5l_start(conf->log);
7986		if (ret)
7987			return ret;
7988
7989		return 0;
7990	}
7991	if (mddev->recovery_disabled == conf->recovery_disabled)
7992		return -EBUSY;
7993
7994	if (rdev->saved_raid_disk < 0 && has_failed(conf))
7995		/* no point adding a device */
7996		return -EINVAL;
7997
7998	if (rdev->raid_disk >= 0)
7999		first = last = rdev->raid_disk;
8000
8001	/*
8002	 * find the disk ... but prefer rdev->saved_raid_disk
8003	 * if possible.
8004	 */
8005	if (rdev->saved_raid_disk >= 0 &&
8006	    rdev->saved_raid_disk >= first &&
8007	    rdev->saved_raid_disk <= last &&
8008	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
8009		first = rdev->saved_raid_disk;
8010
8011	for (disk = first; disk <= last; disk++) {
8012		p = conf->disks + disk;
8013		if (p->rdev == NULL) {
8014			clear_bit(In_sync, &rdev->flags);
8015			rdev->raid_disk = disk;
8016			if (rdev->saved_raid_disk != disk)
8017				conf->fullsync = 1;
8018			rcu_assign_pointer(p->rdev, rdev);
8019
8020			err = log_modify(conf, rdev, true);
8021
8022			goto out;
8023		}
8024	}
8025	for (disk = first; disk <= last; disk++) {
8026		p = conf->disks + disk;
8027		if (test_bit(WantReplacement, &p->rdev->flags) &&
8028		    p->replacement == NULL) {
8029			clear_bit(In_sync, &rdev->flags);
8030			set_bit(Replacement, &rdev->flags);
8031			rdev->raid_disk = disk;
8032			err = 0;
8033			conf->fullsync = 1;
8034			rcu_assign_pointer(p->replacement, rdev);
8035			break;
8036		}
8037	}
8038out:
8039	print_raid5_conf(conf);
8040	return err;
8041}
8042
8043static int raid5_resize(struct mddev *mddev, sector_t sectors)
8044{
8045	/* no resync is happening, and there is enough space
8046	 * on all devices, so we can resize.
8047	 * We need to make sure resync covers any new space.
8048	 * If the array is shrinking we should possibly wait until
8049	 * any io in the removed space completes, but it hardly seems
8050	 * worth it.
8051	 */
8052	sector_t newsize;
8053	struct r5conf *conf = mddev->private;
8054
8055	if (raid5_has_log(conf) || raid5_has_ppl(conf))
8056		return -EINVAL;
8057	sectors &= ~((sector_t)conf->chunk_sectors - 1);
8058	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8059	if (mddev->external_size &&
8060	    mddev->array_sectors > newsize)
8061		return -EINVAL;
8062	if (mddev->bitmap) {
8063		int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8064		if (ret)
8065			return ret;
8066	}
8067	md_set_array_sectors(mddev, newsize);
8068	if (sectors > mddev->dev_sectors &&
8069	    mddev->recovery_cp > mddev->dev_sectors) {
8070		mddev->recovery_cp = mddev->dev_sectors;
8071		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8072	}
8073	mddev->dev_sectors = sectors;
8074	mddev->resync_max_sectors = sectors;
8075	return 0;
8076}
8077
8078static int check_stripe_cache(struct mddev *mddev)
8079{
8080	/* Can only proceed if there are plenty of stripe_heads.
8081	 * We need a minimum of one full stripe, and for sensible progress
8082	 * it is best to have about 4 times that.
8083	 * If we require 4 times, then the default 256 4K stripe_heads will
8084	 * allow for chunk sizes up to 256K, which is probably OK.
8085	 * If the chunk size is greater, user-space should request more
8086	 * stripe_heads first.
8087	 */
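	/*
	 * Illustrative example: with a 4KiB stripe size, a 1MiB chunk
	 * needs (1MiB / 4KiB) * 4 = 1024 stripe_heads, so the default
	 * cache of 256 would have to be enlarged before reshaping.
	 */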
8088	struct r5conf *conf = mddev->private;
8089	if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8090	    > conf->min_nr_stripes ||
8091	    ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8092	    > conf->min_nr_stripes) {
8093		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
8094			mdname(mddev),
8095			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8096			 / RAID5_STRIPE_SIZE(conf))*4);
8097		return 0;
8098	}
8099	return 1;
8100}
8101
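/*
 * Validate a requested reshape and grow the scribble buffers and the
 * stripe pool to cover the new geometry before raid5_start_reshape
 * commits to it.  Shrinking the pool is never attempted here.
 */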
8102static int check_reshape(struct mddev *mddev)
8103{
8104	struct r5conf *conf = mddev->private;
8105
8106	if (raid5_has_log(conf) || raid5_has_ppl(conf))
8107		return -EINVAL;
8108	if (mddev->delta_disks == 0 &&
8109	    mddev->new_layout == mddev->layout &&
8110	    mddev->new_chunk_sectors == mddev->chunk_sectors)
8111		return 0; /* nothing to do */
8112	if (has_failed(conf))
8113		return -EINVAL;
8114	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
8115		/* We might be able to shrink, but the devices must
8116		 * be made bigger first.
8117		 * For raid6, 4 is the minimum number of devices.
8118		 * Otherwise 2 is the minimum.
8119		 */
8120		int min = 2;
8121		if (mddev->level == 6)
8122			min = 4;
8123		if (mddev->raid_disks + mddev->delta_disks < min)
8124			return -EINVAL;
8125	}
8126
8127	if (!check_stripe_cache(mddev))
8128		return -ENOSPC;
8129
8130	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8131	    mddev->delta_disks > 0)
8132		if (resize_chunks(conf,
8133				  conf->previous_raid_disks
8134				  + max(0, mddev->delta_disks),
8135				  max(mddev->new_chunk_sectors,
8136				      mddev->chunk_sectors)
8137			    ) < 0)
8138			return -ENOMEM;
8139
8140	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8141		return 0; /* never bother to shrink */
8142	return resize_stripes(conf, (conf->previous_raid_disks
8143				     + mddev->delta_disks));
8144}
8145
8146static int raid5_start_reshape(struct mddev *mddev)
8147{
8148	struct r5conf *conf = mddev->private;
8149	struct md_rdev *rdev;
8150	int spares = 0;
8151	unsigned long flags;
8152
8153	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8154		return -EBUSY;
8155
8156	if (!check_stripe_cache(mddev))
8157		return -ENOSPC;
8158
8159	if (has_failed(conf))
8160		return -EINVAL;
8161
8162	rdev_for_each(rdev, mddev) {
8163		if (!test_bit(In_sync, &rdev->flags)
8164		    && !test_bit(Faulty, &rdev->flags))
8165			spares++;
8166	}
8167
8168	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8169		/* Not enough devices even to make a degraded array
8170		 * of that size
8171		 */
8172		return -EINVAL;
8173
8174	/* Refuse to reduce size of the array.  Any reductions in
8175	 * array size must be through explicit setting of array_size
8176	 * attribute.
8177	 */
8178	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8179	    < mddev->array_sectors) {
8180		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8181			mdname(mddev));
8182		return -EINVAL;
8183	}
8184
8185	atomic_set(&conf->reshape_stripes, 0);
8186	spin_lock_irq(&conf->device_lock);
8187	write_seqcount_begin(&conf->gen_lock);
8188	conf->previous_raid_disks = conf->raid_disks;
8189	conf->raid_disks += mddev->delta_disks;
8190	conf->prev_chunk_sectors = conf->chunk_sectors;
8191	conf->chunk_sectors = mddev->new_chunk_sectors;
8192	conf->prev_algo = conf->algorithm;
8193	conf->algorithm = mddev->new_layout;
8194	conf->generation++;
8195	/* Code that selects data_offset needs to see the generation update
8196	 * if reshape_progress has been set - so a memory barrier is needed.
8197	 */
8198	smp_mb();
8199	if (mddev->reshape_backwards)
8200		conf->reshape_progress = raid5_size(mddev, 0, 0);
8201	else
8202		conf->reshape_progress = 0;
8203	conf->reshape_safe = conf->reshape_progress;
8204	write_seqcount_end(&conf->gen_lock);
8205	spin_unlock_irq(&conf->device_lock);
8206
8207	/* Now make sure any requests that proceeded on the assumption
8208	 * the reshape wasn't running - like Discard or Read - have
8209	 * completed.
8210	 */
8211	mddev_suspend(mddev);
8212	mddev_resume(mddev);
8213
8214	/* Add some new drives, as many as will fit.
8215	 * We know there are enough to make the newly sized array work.
8216	 * Don't add devices if we are reducing the number of
8217	 * devices in the array.  This is because it is not possible
8218	 * to correctly record the "partially reconstructed" state of
8219	 * such devices during the reshape and confusion could result.
8220	 */
8221	if (mddev->delta_disks >= 0) {
8222		rdev_for_each(rdev, mddev)
8223			if (rdev->raid_disk < 0 &&
8224			    !test_bit(Faulty, &rdev->flags)) {
8225				if (raid5_add_disk(mddev, rdev) == 0) {
8226					if (rdev->raid_disk
8227					    >= conf->previous_raid_disks)
8228						set_bit(In_sync, &rdev->flags);
8229					else
8230						rdev->recovery_offset = 0;
8231
8232					/* Failure here is OK */
8233					sysfs_link_rdev(mddev, rdev);
8234				}
8235			} else if (rdev->raid_disk >= conf->previous_raid_disks
8236				   && !test_bit(Faulty, &rdev->flags)) {
8237				/* This is a spare that was manually added */
8238				set_bit(In_sync, &rdev->flags);
8239			}
8240
8241		/* When a reshape changes the number of devices,
8242		 * ->degraded is measured against the larger of the
8243		 * pre and post number of devices.
8244		 */
8245		spin_lock_irqsave(&conf->device_lock, flags);
8246		mddev->degraded = raid5_calc_degraded(conf);
8247		spin_unlock_irqrestore(&conf->device_lock, flags);
8248	}
8249	mddev->raid_disks = conf->raid_disks;
8250	mddev->reshape_position = conf->reshape_progress;
8251	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8252
8253	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8254	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8255	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8256	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8257	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8258	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8259						"reshape");
8260	if (!mddev->sync_thread) {
8261		mddev->recovery = 0;
8262		spin_lock_irq(&conf->device_lock);
8263		write_seqcount_begin(&conf->gen_lock);
8264		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8265		mddev->new_chunk_sectors =
8266			conf->chunk_sectors = conf->prev_chunk_sectors;
8267		mddev->new_layout = conf->algorithm = conf->prev_algo;
8268		rdev_for_each(rdev, mddev)
8269			rdev->new_data_offset = rdev->data_offset;
8270		smp_wmb();
8271		conf->generation--;
8272		conf->reshape_progress = MaxSector;
8273		mddev->reshape_position = MaxSector;
8274		write_seqcount_end(&conf->gen_lock);
8275		spin_unlock_irq(&conf->device_lock);
8276		return -EAGAIN;
8277	}
8278	conf->reshape_checkpoint = jiffies;
8279	md_wakeup_thread(mddev->sync_thread);
8280	md_new_event(mddev);
8281	return 0;
8282}
8283
8284/* This is called from the reshape thread and should make any
8285 * changes needed in 'conf'
8286 */
8287static void end_reshape(struct r5conf *conf)
8288{
8289
8290	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8291		struct md_rdev *rdev;
8292
8293		spin_lock_irq(&conf->device_lock);
8294		conf->previous_raid_disks = conf->raid_disks;
8295		md_finish_reshape(conf->mddev);
8296		smp_wmb();
8297		conf->reshape_progress = MaxSector;
8298		conf->mddev->reshape_position = MaxSector;
8299		rdev_for_each(rdev, conf->mddev)
8300			if (rdev->raid_disk >= 0 &&
8301			    !test_bit(Journal, &rdev->flags) &&
8302			    !test_bit(In_sync, &rdev->flags))
8303				rdev->recovery_offset = MaxSector;
8304		spin_unlock_irq(&conf->device_lock);
8305		wake_up(&conf->wait_for_overlap);
8306
8307		if (conf->mddev->queue)
8308			raid5_set_io_opt(conf);
8309	}
8310}
8311
8312/* This is called from the raid5d thread with mddev_lock held.
8313 * It makes config changes to the device.
8314 */
8315static void raid5_finish_reshape(struct mddev *mddev)
8316{
8317	struct r5conf *conf = mddev->private;
8318
8319	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8320
8321		if (mddev->delta_disks <= 0) {
8322			int d;
8323			spin_lock_irq(&conf->device_lock);
8324			mddev->degraded = raid5_calc_degraded(conf);
8325			spin_unlock_irq(&conf->device_lock);
8326			for (d = conf->raid_disks ;
8327			     d < conf->raid_disks - mddev->delta_disks;
8328			     d++) {
8329				struct md_rdev *rdev = conf->disks[d].rdev;
8330				if (rdev)
8331					clear_bit(In_sync, &rdev->flags);
8332				rdev = conf->disks[d].replacement;
8333				if (rdev)
8334					clear_bit(In_sync, &rdev->flags);
8335			}
8336		}
8337		mddev->layout = conf->algorithm;
8338		mddev->chunk_sectors = conf->chunk_sectors;
8339		mddev->reshape_position = MaxSector;
8340		mddev->delta_disks = 0;
8341		mddev->reshape_backwards = 0;
8342	}
8343}
8344
8345static void raid5_quiesce(struct mddev *mddev, int quiesce)
8346{
8347	struct r5conf *conf = mddev->private;
8348
8349	if (quiesce) {
8350		/* stop all writes */
8351		lock_all_device_hash_locks_irq(conf);
8352		/* '2' tells resync/reshape to pause so that all
8353		 * active stripes can drain
8354		 */
8355		r5c_flush_cache(conf, INT_MAX);
8356		conf->quiesce = 2;
8357		wait_event_cmd(conf->wait_for_quiescent,
8358				    atomic_read(&conf->active_stripes) == 0 &&
8359				    atomic_read(&conf->active_aligned_reads) == 0,
8360				    unlock_all_device_hash_locks_irq(conf),
8361				    lock_all_device_hash_locks_irq(conf));
8362		conf->quiesce = 1;
8363		unlock_all_device_hash_locks_irq(conf);
8364		/* allow reshape to continue */
8365		wake_up(&conf->wait_for_overlap);
8366	} else {
8367		/* re-enable writes */
8368		lock_all_device_hash_locks_irq(conf);
8369		conf->quiesce = 0;
8370		wake_up(&conf->wait_for_quiescent);
8371		wake_up(&conf->wait_for_overlap);
8372		unlock_all_device_hash_locks_irq(conf);
8373	}
8374	log_quiesce(conf, quiesce);
8375}
8376
8377static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8378{
8379	struct r0conf *raid0_conf = mddev->private;
8380	sector_t sectors;
8381
8382	/* for raid0 takeover only one zone is supported */
8383	if (raid0_conf->nr_strip_zones > 1) {
8384		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8385			mdname(mddev));
8386		return ERR_PTR(-EINVAL);
8387	}
8388
8389	sectors = raid0_conf->strip_zone[0].zone_end;
8390	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8391	mddev->dev_sectors = sectors;
8392	mddev->new_level = level;
8393	mddev->new_layout = ALGORITHM_PARITY_N;
8394	mddev->new_chunk_sectors = mddev->chunk_sectors;
8395	mddev->raid_disks += 1;
8396	mddev->delta_disks = 1;
8397	/* make sure it will not be marked as dirty */
8398	mddev->recovery_cp = MaxSector;
8399
8400	return setup_conf(mddev);
8401}
8402
8403static void *raid5_takeover_raid1(struct mddev *mddev)
8404{
8405	int chunksect;
8406	void *ret;
8407
8408	if (mddev->raid_disks != 2 ||
8409	    mddev->degraded > 1)
8410		return ERR_PTR(-EINVAL);
8411
8412	/* Should check if there are write-behind devices? */
8413
8414	chunksect = 64*2; /* 64K by default */
8415
8416	/* The array must be an exact multiple of chunksize */
8417	while (chunksect && (mddev->array_sectors & (chunksect-1)))
8418		chunksect >>= 1;
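	/*
	 * The loop above picks the largest power-of-two chunk no bigger
	 * than 64K that evenly divides the array size: e.g. an array
	 * whose sector count is a multiple of 128 keeps the 64K chunk,
	 * while one that is only a multiple of 8 sectors ends up with a
	 * 4K chunk.
	 */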
8419
8420	if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8421		/* array size does not allow a suitable chunk size */
8422		return ERR_PTR(-EINVAL);
8423
8424	mddev->new_level = 5;
8425	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8426	mddev->new_chunk_sectors = chunksect;
8427
8428	ret = setup_conf(mddev);
8429	if (!IS_ERR(ret))
8430		mddev_clear_unsupported_flags(mddev,
8431			UNSUPPORTED_MDDEV_FLAGS);
8432	return ret;
8433}
8434
8435static void *raid5_takeover_raid6(struct mddev *mddev)
8436{
8437	int new_layout;
8438
8439	switch (mddev->layout) {
8440	case ALGORITHM_LEFT_ASYMMETRIC_6:
8441		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8442		break;
8443	case ALGORITHM_RIGHT_ASYMMETRIC_6:
8444		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8445		break;
8446	case ALGORITHM_LEFT_SYMMETRIC_6:
8447		new_layout = ALGORITHM_LEFT_SYMMETRIC;
8448		break;
8449	case ALGORITHM_RIGHT_SYMMETRIC_6:
8450		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8451		break;
8452	case ALGORITHM_PARITY_0_6:
8453		new_layout = ALGORITHM_PARITY_0;
8454		break;
8455	case ALGORITHM_PARITY_N:
8456		new_layout = ALGORITHM_PARITY_N;
8457		break;
8458	default:
8459		return ERR_PTR(-EINVAL);
8460	}
8461	mddev->new_level = 5;
8462	mddev->new_layout = new_layout;
8463	mddev->delta_disks = -1;
8464	mddev->raid_disks -= 1;
8465	return setup_conf(mddev);
8466}
8467
8468static int raid5_check_reshape(struct mddev *mddev)
8469{
8470	/* For a 2-drive array, the layout and chunk size can be changed
8471	 * immediately as no restriping is needed.
8472	 * For larger arrays we record the new value - after validation
8473	 * to be used by a reshape pass.
8474	 */
8475	struct r5conf *conf = mddev->private;
8476	int new_chunk = mddev->new_chunk_sectors;
8477
8478	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8479		return -EINVAL;
8480	if (new_chunk > 0) {
8481		if (!is_power_of_2(new_chunk))
8482			return -EINVAL;
8483		if (new_chunk < (PAGE_SIZE>>9))
8484			return -EINVAL;
8485		if (mddev->array_sectors & (new_chunk-1))
8486			/* not factor of array size */
8487			return -EINVAL;
8488	}
8489
8490	/* They look valid */
8491
8492	if (mddev->raid_disks == 2) {
8493		/* can make the change immediately */
8494		if (mddev->new_layout >= 0) {
8495			conf->algorithm = mddev->new_layout;
8496			mddev->layout = mddev->new_layout;
8497		}
8498		if (new_chunk > 0) {
8499			conf->chunk_sectors = new_chunk;
8500			mddev->chunk_sectors = new_chunk;
8501		}
8502		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8503		md_wakeup_thread(mddev->thread);
8504	}
8505	return check_reshape(mddev);
8506}
8507
8508static int raid6_check_reshape(struct mddev *mddev)
8509{
8510	int new_chunk = mddev->new_chunk_sectors;
8511
8512	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8513		return -EINVAL;
8514	if (new_chunk > 0) {
8515		if (!is_power_of_2(new_chunk))
8516			return -EINVAL;
8517		if (new_chunk < (PAGE_SIZE >> 9))
8518			return -EINVAL;
8519		if (mddev->array_sectors & (new_chunk-1))
8520			/* not factor of array size */
8521			return -EINVAL;
8522	}
8523
8524	/* They look valid */
8525	return check_reshape(mddev);
8526}
8527
8528static void *raid5_takeover(struct mddev *mddev)
8529{
8530	/* raid5 can take over:
8531	 *  raid0 - if there is only one strip zone - make it a raid4 layout
8532	 *  raid1 - if there are two drives.  We need to know the chunk size
8533	 *  raid4 - trivial - just use a raid4 layout.
8534	 *  raid6 - Providing it is a *_6 layout
8535	 */
8536	if (mddev->level == 0)
8537		return raid45_takeover_raid0(mddev, 5);
8538	if (mddev->level == 1)
8539		return raid5_takeover_raid1(mddev);
8540	if (mddev->level == 4) {
8541		mddev->new_layout = ALGORITHM_PARITY_N;
8542		mddev->new_level = 5;
8543		return setup_conf(mddev);
8544	}
8545	if (mddev->level == 6)
8546		return raid5_takeover_raid6(mddev);
8547
8548	return ERR_PTR(-EINVAL);
8549}
8550
8551static void *raid4_takeover(struct mddev *mddev)
8552{
8553	/* raid4 can take over:
8554	 *  raid0 - if there is only one strip zone
8555	 *  raid5 - if layout is right
8556	 */
8557	if (mddev->level == 0)
8558		return raid45_takeover_raid0(mddev, 4);
8559	if (mddev->level == 5 &&
8560	    mddev->layout == ALGORITHM_PARITY_N) {
8561		mddev->new_layout = 0;
8562		mddev->new_level = 4;
8563		return setup_conf(mddev);
8564	}
8565	return ERR_PTR(-EINVAL);
8566}
8567
8568static struct md_personality raid5_personality;
8569
8570static void *raid6_takeover(struct mddev *mddev)
8571{
8572	/* Currently can only take over a raid5.  We map the
8573	 * personality to an equivalent raid6 personality
8574	 * with the Q block at the end.
8575	 */
8576	int new_layout;
8577
8578	if (mddev->pers != &raid5_personality)
8579		return ERR_PTR(-EINVAL);
8580	if (mddev->degraded > 1)
8581		return ERR_PTR(-EINVAL);
8582	if (mddev->raid_disks > 253)
8583		return ERR_PTR(-EINVAL);
8584	if (mddev->raid_disks < 3)
8585		return ERR_PTR(-EINVAL);
8586
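	/*
	 * Map the raid5 layout to the matching *_6 layout, which keeps
	 * the existing data/parity placement and puts the new Q syndrome
	 * on the added last device, so no existing data has to move.
	 */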
8587	switch (mddev->layout) {
8588	case ALGORITHM_LEFT_ASYMMETRIC:
8589		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8590		break;
8591	case ALGORITHM_RIGHT_ASYMMETRIC:
8592		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8593		break;
8594	case ALGORITHM_LEFT_SYMMETRIC:
8595		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8596		break;
8597	case ALGORITHM_RIGHT_SYMMETRIC:
8598		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8599		break;
8600	case ALGORITHM_PARITY_0:
8601		new_layout = ALGORITHM_PARITY_0_6;
8602		break;
8603	case ALGORITHM_PARITY_N:
8604		new_layout = ALGORITHM_PARITY_N;
8605		break;
8606	default:
8607		return ERR_PTR(-EINVAL);
8608	}
8609	mddev->new_level = 6;
8610	mddev->new_layout = new_layout;
8611	mddev->delta_disks = 1;
8612	mddev->raid_disks += 1;
8613	return setup_conf(mddev);
8614}
8615
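/*
 * Handle writes to the consistency_policy sysfs attribute: "ppl"
 * enables the partial parity log on a journal-less raid5 array, while
 * "resync" tears PPL down again or, once the journal device has been
 * removed, clears the journal flag so the array falls back to plain
 * resync-based recovery.
 */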
8616static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8617{
8618	struct r5conf *conf;
8619	int err;
8620
8621	err = mddev_lock(mddev);
8622	if (err)
8623		return err;
8624	conf = mddev->private;
8625	if (!conf) {
8626		mddev_unlock(mddev);
8627		return -ENODEV;
8628	}
8629
8630	if (strncmp(buf, "ppl", 3) == 0) {
8631		/* ppl only works with RAID 5 */
8632		if (!raid5_has_ppl(conf) && conf->level == 5) {
8633			err = log_init(conf, NULL, true);
8634			if (!err) {
8635				err = resize_stripes(conf, conf->pool_size);
8636				if (err)
8637					log_exit(conf);
8638			}
8639		} else
8640			err = -EINVAL;
8641	} else if (strncmp(buf, "resync", 6) == 0) {
8642		if (raid5_has_ppl(conf)) {
8643			mddev_suspend(mddev);
8644			log_exit(conf);
8645			mddev_resume(mddev);
8646			err = resize_stripes(conf, conf->pool_size);
8647		} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8648			   r5l_log_disk_error(conf)) {
8649			bool journal_dev_exists = false;
8650			struct md_rdev *rdev;
8651
8652			rdev_for_each(rdev, mddev)
8653				if (test_bit(Journal, &rdev->flags)) {
8654					journal_dev_exists = true;
8655					break;
8656				}
8657
8658			if (!journal_dev_exists) {
8659				mddev_suspend(mddev);
8660				clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8661				mddev_resume(mddev);
8662			} else  /* need remove journal device first */
8663				err = -EBUSY;
8664		} else
8665			err = -EINVAL;
8666	} else {
8667		err = -EINVAL;
8668	}
8669
8670	if (!err)
8671		md_update_sb(mddev, 1);
8672
8673	mddev_unlock(mddev);
8674
8675	return err;
8676}
8677
8678static int raid5_start(struct mddev *mddev)
8679{
8680	struct r5conf *conf = mddev->private;
8681
8682	return r5l_start(conf->log);
8683}
8684
8685static struct md_personality raid6_personality =
8686{
8687	.name		= "raid6",
8688	.level		= 6,
8689	.owner		= THIS_MODULE,
8690	.make_request	= raid5_make_request,
8691	.run		= raid5_run,
8692	.start		= raid5_start,
8693	.free		= raid5_free,
8694	.status		= raid5_status,
8695	.error_handler	= raid5_error,
8696	.hot_add_disk	= raid5_add_disk,
8697	.hot_remove_disk= raid5_remove_disk,
8698	.spare_active	= raid5_spare_active,
8699	.sync_request	= raid5_sync_request,
8700	.resize		= raid5_resize,
8701	.size		= raid5_size,
8702	.check_reshape	= raid6_check_reshape,
8703	.start_reshape  = raid5_start_reshape,
8704	.finish_reshape = raid5_finish_reshape,
8705	.quiesce	= raid5_quiesce,
8706	.takeover	= raid6_takeover,
8707	.change_consistency_policy = raid5_change_consistency_policy,
8708};
8709static struct md_personality raid5_personality =
8710{
8711	.name		= "raid5",
8712	.level		= 5,
8713	.owner		= THIS_MODULE,
8714	.make_request	= raid5_make_request,
8715	.run		= raid5_run,
8716	.start		= raid5_start,
8717	.free		= raid5_free,
8718	.status		= raid5_status,
8719	.error_handler	= raid5_error,
8720	.hot_add_disk	= raid5_add_disk,
8721	.hot_remove_disk= raid5_remove_disk,
8722	.spare_active	= raid5_spare_active,
8723	.sync_request	= raid5_sync_request,
8724	.resize		= raid5_resize,
8725	.size		= raid5_size,
8726	.check_reshape	= raid5_check_reshape,
8727	.start_reshape  = raid5_start_reshape,
8728	.finish_reshape = raid5_finish_reshape,
8729	.quiesce	= raid5_quiesce,
8730	.takeover	= raid5_takeover,
8731	.change_consistency_policy = raid5_change_consistency_policy,
8732};
8733
8734static struct md_personality raid4_personality =
8735{
8736	.name		= "raid4",
8737	.level		= 4,
8738	.owner		= THIS_MODULE,
8739	.make_request	= raid5_make_request,
8740	.run		= raid5_run,
8741	.start		= raid5_start,
8742	.free		= raid5_free,
8743	.status		= raid5_status,
8744	.error_handler	= raid5_error,
8745	.hot_add_disk	= raid5_add_disk,
8746	.hot_remove_disk= raid5_remove_disk,
8747	.spare_active	= raid5_spare_active,
8748	.sync_request	= raid5_sync_request,
8749	.resize		= raid5_resize,
8750	.size		= raid5_size,
8751	.check_reshape	= raid5_check_reshape,
8752	.start_reshape  = raid5_start_reshape,
8753	.finish_reshape = raid5_finish_reshape,
8754	.quiesce	= raid5_quiesce,
8755	.takeover	= raid4_takeover,
8756	.change_consistency_policy = raid5_change_consistency_policy,
8757};
8758
8759static int __init raid5_init(void)
8760{
8761	int ret;
8762
8763	raid5_wq = alloc_workqueue("raid5wq",
8764		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8765	if (!raid5_wq)
8766		return -ENOMEM;
8767
8768	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8769				      "md/raid5:prepare",
8770				      raid456_cpu_up_prepare,
8771				      raid456_cpu_dead);
8772	if (ret) {
8773		destroy_workqueue(raid5_wq);
8774		return ret;
8775	}
8776	register_md_personality(&raid6_personality);
8777	register_md_personality(&raid5_personality);
8778	register_md_personality(&raid4_personality);
8779	return 0;
8780}
8781
8782static void raid5_exit(void)
8783{
8784	unregister_md_personality(&raid6_personality);
8785	unregister_md_personality(&raid5_personality);
8786	unregister_md_personality(&raid4_personality);
8787	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8788	destroy_workqueue(raid5_wq);
8789}
8790
8791module_init(raid5_init);
8792module_exit(raid5_exit);
8793MODULE_LICENSE("GPL");
8794MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8795MODULE_ALIAS("md-personality-4"); /* RAID5 */
8796MODULE_ALIAS("md-raid5");
8797MODULE_ALIAS("md-raid4");
8798MODULE_ALIAS("md-level-5");
8799MODULE_ALIAS("md-level-4");
8800MODULE_ALIAS("md-personality-8"); /* RAID6 */
8801MODULE_ALIAS("md-raid6");
8802MODULE_ALIAS("md-level-6");
8803
8804/* This used to be two separate modules, they were: */
8805MODULE_ALIAS("raid5");
8806MODULE_ALIAS("raid6");
8807