1// SPDX-License-Identifier: GPL-2.0
2/*
3 * bcache setup/teardown code, and some metadata io - read a superblock and
4 * figure out what to do with it.
5 *
6 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
7 * Copyright 2012 Google, Inc.
8 */
9
10#include "bcache.h"
11#include "btree.h"
12#include "debug.h"
13#include "extents.h"
14#include "request.h"
15#include "writeback.h"
16#include "features.h"
17
18#include <linux/blkdev.h>
19#include <linux/debugfs.h>
20#include <linux/genhd.h>
21#include <linux/idr.h>
22#include <linux/kthread.h>
23#include <linux/workqueue.h>
24#include <linux/module.h>
25#include <linux/random.h>
26#include <linux/reboot.h>
27#include <linux/sysfs.h>
28
29unsigned int bch_cutoff_writeback;
30unsigned int bch_cutoff_writeback_sync;
31
32static const char bcache_magic[] = {
33	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
34	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
35};
36
37static const char invalid_uuid[] = {
38	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
39	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
40};
41
42static struct kobject *bcache_kobj;
43struct mutex bch_register_lock;
44bool bcache_is_reboot;
45LIST_HEAD(bch_cache_sets);
46static LIST_HEAD(uncached_devices);
47
48static int bcache_major;
49static DEFINE_IDA(bcache_device_idx);
50static wait_queue_head_t unregister_wait;
51struct workqueue_struct *bcache_wq;
52struct workqueue_struct *bch_flush_wq;
53struct workqueue_struct *bch_journal_wq;
54
55
56#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
/* limit on the number of partitions on a single bcache device */
#define BCACHE_MINORS		128
/* limit on the number of bcache devices on a single system */
60#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS)/BCACHE_MINORS)
61
62/* Superblock */
63
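/*
 * Decode the bucket size from the on-disk superblock. With the large_bucket
 * feature the on-disk field holds log2 of the bucket size; with the obsolete
 * large_bucket layout the extra high 16 bits live in obso_bucket_size_hi.
 */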
64static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
65{
66	unsigned int bucket_size = le16_to_cpu(s->bucket_size);
67
68	if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
69		if (bch_has_feature_large_bucket(sb)) {
70			unsigned int max, order;
71
72			max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
73			order = le16_to_cpu(s->bucket_size);
			/*
			 * The bcache tool ensures this overflow cannot
			 * happen; an error message here is enough.
			 */
78			if (order > max)
79				pr_err("Bucket size (1 << %u) overflows\n",
80					order);
81			bucket_size = 1 << order;
82		} else if (bch_has_feature_obso_large_bucket(sb)) {
83			bucket_size +=
84				le16_to_cpu(s->obso_bucket_size_hi) << 16;
85		}
86	}
87
88	return bucket_size;
89}
90
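/*
 * Validate the fields shared by all cache device superblock versions:
 * bucket/block sizes, bucket counts, journal bucket layout and the position
 * of this device within its set. Returns an error string on failure, NULL
 * on success.
 */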
static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
92				     struct cache_sb_disk *s)
93{
94	const char *err;
95	unsigned int i;
96
	sb->first_bucket	= le16_to_cpu(s->first_bucket);
98	sb->nbuckets	= le64_to_cpu(s->nbuckets);
99	sb->bucket_size	= get_bucket_size(sb, s);
100
101	sb->nr_in_set	= le16_to_cpu(s->nr_in_set);
102	sb->nr_this_dev	= le16_to_cpu(s->nr_this_dev);
103
104	err = "Too many journal buckets";
105	if (sb->keys > SB_JOURNAL_BUCKETS)
106		goto err;
107
108	err = "Too many buckets";
109	if (sb->nbuckets > LONG_MAX)
110		goto err;
111
112	err = "Not enough buckets";
113	if (sb->nbuckets < 1 << 7)
114		goto err;
115
116	err = "Bad block size (not power of 2)";
117	if (!is_power_of_2(sb->block_size))
118		goto err;
119
120	err = "Bad block size (larger than page size)";
121	if (sb->block_size > PAGE_SECTORS)
122		goto err;
123
124	err = "Bad bucket size (not power of 2)";
125	if (!is_power_of_2(sb->bucket_size))
126		goto err;
127
128	err = "Bad bucket size (smaller than page size)";
129	if (sb->bucket_size < PAGE_SECTORS)
130		goto err;
131
132	err = "Invalid superblock: device too small";
133	if (get_capacity(bdev->bd_disk) <
134	    sb->bucket_size * sb->nbuckets)
135		goto err;
136
137	err = "Bad UUID";
138	if (bch_is_zero(sb->set_uuid, 16))
139		goto err;
140
141	err = "Bad cache device number in set";
142	if (!sb->nr_in_set ||
143	    sb->nr_in_set <= sb->nr_this_dev ||
144	    sb->nr_in_set > MAX_CACHES_PER_SET)
145		goto err;
146
147	err = "Journal buckets not sequential";
148	for (i = 0; i < sb->keys; i++)
149		if (sb->d[i] != sb->first_bucket + i)
150			goto err;
151
152	err = "Too many journal buckets";
153	if (sb->first_bucket + sb->keys > sb->nbuckets)
154		goto err;
155
156	err = "Invalid superblock: first bucket comes before end of super";
157	if (sb->first_bucket * sb->bucket_size < 16)
158		goto err;
159
160	err = NULL;
161err:
162	return err;
163}
164
165
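/*
 * Read the superblock from @bdev into @sb, converting from the on-disk
 * little-endian layout and dispatching the version-specific checks. On
 * success *res points at the on-disk superblock buffer and NULL is
 * returned; on failure the page is released and an error string is
 * returned.
 */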
166static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
167			      struct cache_sb_disk **res)
168{
169	const char *err;
170	struct cache_sb_disk *s;
171	struct page *page;
172	unsigned int i;
173
174	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
175				   SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
176	if (IS_ERR(page))
177		return "IO error";
178	s = page_address(page) + offset_in_page(SB_OFFSET);
179
180	sb->offset		= le64_to_cpu(s->offset);
181	sb->version		= le64_to_cpu(s->version);
182
183	memcpy(sb->magic,	s->magic, 16);
184	memcpy(sb->uuid,	s->uuid, 16);
185	memcpy(sb->set_uuid,	s->set_uuid, 16);
186	memcpy(sb->label,	s->label, SB_LABEL_SIZE);
187
188	sb->flags		= le64_to_cpu(s->flags);
189	sb->seq			= le64_to_cpu(s->seq);
190	sb->last_mount		= le32_to_cpu(s->last_mount);
191	sb->keys		= le16_to_cpu(s->keys);
192
193	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
194		sb->d[i] = le64_to_cpu(s->d[i]);
195
196	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
197		 sb->version, sb->flags, sb->seq, sb->keys);
198
199	err = "Not a bcache superblock (bad offset)";
200	if (sb->offset != SB_SECTOR)
201		goto err;
202
203	err = "Not a bcache superblock (bad magic)";
204	if (memcmp(sb->magic, bcache_magic, 16))
205		goto err;
206
207	err = "Bad checksum";
208	if (s->csum != csum_set(s))
209		goto err;
210
211	err = "Bad UUID";
212	if (bch_is_zero(sb->uuid, 16))
213		goto err;
214
215	sb->block_size	= le16_to_cpu(s->block_size);
216
217	err = "Superblock block size smaller than device block size";
218	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
219		goto err;
220
221	switch (sb->version) {
222	case BCACHE_SB_VERSION_BDEV:
223		sb->data_offset	= BDEV_DATA_START_DEFAULT;
224		break;
225	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
226	case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
227		sb->data_offset	= le64_to_cpu(s->data_offset);
228
229		err = "Bad data offset";
230		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
231			goto err;
232
233		break;
234	case BCACHE_SB_VERSION_CDEV:
235	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
236		err = read_super_common(sb, bdev, s);
237		if (err)
238			goto err;
239		break;
240	case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
		/*
		 * Feature bits are needed in read_super_common(),
		 * so convert them first.
		 */
245		sb->feature_compat = le64_to_cpu(s->feature_compat);
246		sb->feature_incompat = le64_to_cpu(s->feature_incompat);
247		sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
248
		/* Reject any unknown feature bits */
250		err = "Unsupported compatible feature found";
251		if (bch_has_unknown_compat_features(sb))
252			goto err;
253
254		err = "Unsupported read-only compatible feature found";
255		if (bch_has_unknown_ro_compat_features(sb))
256			goto err;
257
258		err = "Unsupported incompatible feature found";
259		if (bch_has_unknown_incompat_features(sb))
260			goto err;
261
262		err = read_super_common(sb, bdev, s);
263		if (err)
264			goto err;
265		break;
266	default:
267		err = "Unsupported superblock version";
268		goto err;
269	}
270
271	sb->last_mount = (u32)ktime_get_real_seconds();
272	*res = s;
273	return NULL;
274err:
275	put_page(page);
276	return err;
277}
278
279static void write_bdev_super_endio(struct bio *bio)
280{
281	struct cached_dev *dc = bio->bi_private;
282
283	if (bio->bi_status)
284		bch_count_backing_io_errors(dc, bio);
285
286	closure_put(&dc->sb_write);
287}
288
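/*
 * Copy the in-memory superblock fields into the on-disk buffer, compute the
 * checksum and submit the REQ_SYNC|REQ_META write bio to SB_SECTOR. The
 * caller keeps its sb_write closure held until the bio completes.
 */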
289static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
290		struct bio *bio)
291{
292	unsigned int i;
293
294	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
295	bio->bi_iter.bi_sector	= SB_SECTOR;
296	__bio_add_page(bio, virt_to_page(out), SB_SIZE,
297			offset_in_page(out));
298
299	out->offset		= cpu_to_le64(sb->offset);
300
301	memcpy(out->uuid,	sb->uuid, 16);
302	memcpy(out->set_uuid,	sb->set_uuid, 16);
303	memcpy(out->label,	sb->label, SB_LABEL_SIZE);
304
305	out->flags		= cpu_to_le64(sb->flags);
306	out->seq		= cpu_to_le64(sb->seq);
307
308	out->last_mount		= cpu_to_le32(sb->last_mount);
309	out->first_bucket	= cpu_to_le16(sb->first_bucket);
310	out->keys		= cpu_to_le16(sb->keys);
311
312	for (i = 0; i < sb->keys; i++)
313		out->d[i] = cpu_to_le64(sb->d[i]);
314
315	if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
316		out->feature_compat    = cpu_to_le64(sb->feature_compat);
317		out->feature_incompat  = cpu_to_le64(sb->feature_incompat);
318		out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
319	}
320
321	out->version		= cpu_to_le64(sb->version);
322	out->csum = csum_set(out);
323
324	pr_debug("ver %llu, flags %llu, seq %llu\n",
325		 sb->version, sb->flags, sb->seq);
326
327	submit_bio(bio);
328}
329
330static void bch_write_bdev_super_unlock(struct closure *cl)
331{
332	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
333
334	up(&dc->sb_write_mutex);
335}
336
337void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
338{
339	struct closure *cl = &dc->sb_write;
340	struct bio *bio = &dc->sb_bio;
341
342	down(&dc->sb_write_mutex);
343	closure_init(cl, parent);
344
345	bio_init(bio, dc->sb_bv, 1);
346	bio_set_dev(bio, dc->bdev);
347	bio->bi_end_io	= write_bdev_super_endio;
348	bio->bi_private = dc;
349
350	closure_get(cl);
351	/* I/O request sent to backing device */
352	__write_super(&dc->sb, dc->sb_disk, bio);
353
354	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
355}
356
357static void write_super_endio(struct bio *bio)
358{
359	struct cache *ca = bio->bi_private;
360
361	/* is_read = 0 */
362	bch_count_io_errors(ca, bio->bi_status, 0,
363			    "writing superblock");
364	closure_put(&ca->set->sb_write);
365}
366
367static void bcache_write_super_unlock(struct closure *cl)
368{
369	struct cache_set *c = container_of(cl, struct cache_set, sb_write);
370
371	up(&c->sb_write_mutex);
372}
373
374void bcache_write_super(struct cache_set *c)
375{
376	struct closure *cl = &c->sb_write;
377	struct cache *ca = c->cache;
378	struct bio *bio = &ca->sb_bio;
379	unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
380
381	down(&c->sb_write_mutex);
382	closure_init(cl, &c->cl);
383
384	ca->sb.seq++;
385
386	if (ca->sb.version < version)
387		ca->sb.version = version;
388
389	bio_init(bio, ca->sb_bv, 1);
390	bio_set_dev(bio, ca->bdev);
391	bio->bi_end_io	= write_super_endio;
392	bio->bi_private = ca;
393
394	closure_get(cl);
395	__write_super(&ca->sb, ca->sb_disk, bio);
396
397	closure_return_with_destructor(cl, bcache_write_super_unlock);
398}
399
400/* UUID io */
401
402static void uuid_endio(struct bio *bio)
403{
404	struct closure *cl = bio->bi_private;
405	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
406
407	cache_set_err_on(bio->bi_status, c, "accessing uuids");
408	bch_bbio_free(bio, c);
409	closure_put(cl);
410}
411
412static void uuid_io_unlock(struct closure *cl)
413{
414	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
415
416	up(&c->uuid_write_mutex);
417}
418
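/*
 * Read or write the uuid_entry array at the bucket(s) pointed to by @k.
 * Writes go to every pointer in the key (each replica); a read only needs
 * the first one. Serialized by uuid_write_mutex and finished under the
 * caller's closure.
 */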
419static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
420		    struct bkey *k, struct closure *parent)
421{
422	struct closure *cl = &c->uuid_write;
423	struct uuid_entry *u;
424	unsigned int i;
425	char buf[80];
426
427	BUG_ON(!parent);
428	down(&c->uuid_write_mutex);
429	closure_init(cl, parent);
430
431	for (i = 0; i < KEY_PTRS(k); i++) {
432		struct bio *bio = bch_bbio_alloc(c);
433
434		bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
435		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
436
437		bio->bi_end_io	= uuid_endio;
438		bio->bi_private = cl;
439		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
440		bch_bio_map(bio, c->uuids);
441
442		bch_submit_bbio(bio, c, k, i);
443
444		if (op != REQ_OP_WRITE)
445			break;
446	}
447
448	bch_extent_to_text(buf, sizeof(buf), k);
449	pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf);
450
451	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
452		if (!bch_is_zero(u->uuid, 16))
453			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
454				 u - c->uuids, u->uuid, u->label,
455				 u->first_reg, u->last_reg, u->invalidated);
456
457	closure_return_with_destructor(cl, uuid_io_unlock);
458}
459
460static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
461{
462	struct bkey *k = &j->uuid_bucket;
463
464	if (__bch_btree_ptr_invalid(c, k))
465		return "bad uuid pointer";
466
467	bkey_copy(&c->uuid_bucket, k);
468	uuid_io(c, REQ_OP_READ, 0, k, cl);
469
470	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
471		struct uuid_entry_v0	*u0 = (void *) c->uuids;
472		struct uuid_entry	*u1 = (void *) c->uuids;
473		int i;
474
475		closure_sync(cl);
476
477		/*
478		 * Since the new uuid entry is bigger than the old, we have to
479		 * convert starting at the highest memory address and work down
480		 * in order to do it in place
481		 */
482
483		for (i = c->nr_uuids - 1;
484		     i >= 0;
485		     --i) {
486			memcpy(u1[i].uuid,	u0[i].uuid, 16);
487			memcpy(u1[i].label,	u0[i].label, 32);
488
489			u1[i].first_reg		= u0[i].first_reg;
490			u1[i].last_reg		= u0[i].last_reg;
491			u1[i].invalidated	= u0[i].invalidated;
492
493			u1[i].flags	= 0;
494			u1[i].sectors	= 0;
495		}
496	}
497
498	return NULL;
499}
500
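/*
 * Allocate a fresh bucket from the btree reserve and synchronously write the
 * uuid array into it, then remember the new location in c->uuid_bucket.
 * Returns 0 on success, 1 if no bucket could be allocated.
 */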
501static int __uuid_write(struct cache_set *c)
502{
503	BKEY_PADDED(key) k;
504	struct closure cl;
505	struct cache *ca = c->cache;
506	unsigned int size;
507
508	closure_init_stack(&cl);
509	lockdep_assert_held(&bch_register_lock);
510
511	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
512		return 1;
513
	size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
515	SET_KEY_SIZE(&k.key, size);
516	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
517	closure_sync(&cl);
518
519	/* Only one bucket used for uuid write */
520	atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
521
522	bkey_copy(&c->uuid_bucket, &k.key);
523	bkey_put(c, &k.key);
524	return 0;
525}
526
527int bch_uuid_write(struct cache_set *c)
528{
529	int ret = __uuid_write(c);
530
531	if (!ret)
532		bch_journal_meta(c, NULL);
533
534	return ret;
535}
536
537static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
538{
539	struct uuid_entry *u;
540
541	for (u = c->uuids;
542	     u < c->uuids + c->nr_uuids; u++)
543		if (!memcmp(u->uuid, uuid, 16))
544			return u;
545
546	return NULL;
547}
548
549static struct uuid_entry *uuid_find_empty(struct cache_set *c)
550{
551	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
552
553	return uuid_find(c, zero_uuid);
554}
555
556/*
557 * Bucket priorities/gens:
558 *
559 * For each bucket, we store on disk its
560 *   8 bit gen
561 *  16 bit priority
562 *
563 * See alloc.c for an explanation of the gen. The priority is used to implement
564 * lru (and in the future other) cache replacement policies; for most purposes
565 * it's just an opaque integer.
566 *
567 * The gens and the priorities don't have a whole lot to do with each other, and
568 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written; if we lose them we just reuse
570 * buckets in suboptimal order.
571 *
 * On disk they're stored in a packed array, in as many buckets as are required
573 * to fit them all. The buckets we use to store them form a list; the journal
574 * header points to the first bucket, the first bucket points to the second
575 * bucket, et cetera.
576 *
577 * This code is used by the allocation code; periodically (whenever it runs out
578 * of buckets to allocate from) the allocation code will invalidate some
579 * buckets, but it can't use those buckets until their new gens are safely on
580 * disk.
581 */
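
/*
 * A rough sketch of the resulting on-disk chain (see prio_io()/prio_read()
 * and bch_prio_write() below): the journal entry records the first prio
 * bucket for each cache, and each prio bucket links to the next one:
 *
 *   journal: prio_bucket[dev] --> +------------------------+
 *                                 | csum | magic | seq     |
 *                                 | next_bucket -----------+--> next prio
 *                                 | data[]: {prio, gen}... |    bucket ...
 *                                 +------------------------+
 */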
582
583static void prio_endio(struct bio *bio)
584{
585	struct cache *ca = bio->bi_private;
586
587	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
588	bch_bbio_free(bio, ca->set);
589	closure_put(&ca->prio);
590}
591
592static void prio_io(struct cache *ca, uint64_t bucket, int op,
593		    unsigned long op_flags)
594{
595	struct closure *cl = &ca->prio;
596	struct bio *bio = bch_bbio_alloc(ca->set);
597
598	closure_init_stack(cl);
599
600	bio->bi_iter.bi_sector	= bucket * ca->sb.bucket_size;
601	bio_set_dev(bio, ca->bdev);
602	bio->bi_iter.bi_size	= meta_bucket_bytes(&ca->sb);
603
604	bio->bi_end_io	= prio_endio;
605	bio->bi_private = ca;
606	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
607	bch_bio_map(bio, ca->disk_buckets);
608
609	closure_bio_submit(ca->set, bio, &ca->prio);
610	closure_sync(cl);
611}
612
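/*
 * Write the prio/gen of every bucket to disk: pack them into freshly
 * allocated prio buckets, chained via p->next_bucket, then journal the
 * update. The old prio buckets are only freed once the new ones are written
 * and journalled, so the previous copy stays intact until the new one is
 * safely on disk.
 */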
613int bch_prio_write(struct cache *ca, bool wait)
614{
615	int i;
616	struct bucket *b;
617	struct closure cl;
618
619	pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
620		 fifo_used(&ca->free[RESERVE_PRIO]),
621		 fifo_used(&ca->free[RESERVE_NONE]),
622		 fifo_used(&ca->free_inc));
623
624	/*
625	 * Pre-check if there are enough free buckets. In the non-blocking
	 * scenario it's better to fail early rather than to start allocating
	 * buckets and clean up later in case of failure.
628	 */
629	if (!wait) {
630		size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
631			       fifo_used(&ca->free[RESERVE_NONE]);
632		if (prio_buckets(ca) > avail)
633			return -ENOMEM;
634	}
635
636	closure_init_stack(&cl);
637
638	lockdep_assert_held(&ca->set->bucket_lock);
639
640	ca->disk_buckets->seq++;
641
642	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
643			&ca->meta_sectors_written);
644
645	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
646		long bucket;
647		struct prio_set *p = ca->disk_buckets;
648		struct bucket_disk *d = p->data;
649		struct bucket_disk *end = d + prios_per_bucket(ca);
650
651		for (b = ca->buckets + i * prios_per_bucket(ca);
652		     b < ca->buckets + ca->sb.nbuckets && d < end;
653		     b++, d++) {
654			d->prio = cpu_to_le16(b->prio);
655			d->gen = b->gen;
656		}
657
658		p->next_bucket	= ca->prio_buckets[i + 1];
659		p->magic	= pset_magic(&ca->sb);
660		p->csum		= bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
661
662		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
663		BUG_ON(bucket == -1);
664
665		mutex_unlock(&ca->set->bucket_lock);
666		prio_io(ca, bucket, REQ_OP_WRITE, 0);
667		mutex_lock(&ca->set->bucket_lock);
668
669		ca->prio_buckets[i] = bucket;
670		atomic_dec_bug(&ca->buckets[bucket].pin);
671	}
672
673	mutex_unlock(&ca->set->bucket_lock);
674
675	bch_journal_meta(ca->set, &cl);
676	closure_sync(&cl);
677
678	mutex_lock(&ca->set->bucket_lock);
679
680	/*
681	 * Don't want the old priorities to get garbage collected until after we
682	 * finish writing the new ones, and they're journalled
683	 */
684	for (i = 0; i < prio_buckets(ca); i++) {
685		if (ca->prio_last_buckets[i])
686			__bch_bucket_free(ca,
687				&ca->buckets[ca->prio_last_buckets[i]]);
688
689		ca->prio_last_buckets[i] = ca->prio_buckets[i];
690	}
691	return 0;
692}
693
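/*
 * Read the prios/gens back at cache set startup: follow the on-disk chain
 * starting at @bucket, verify the checksum and magic of each prio bucket,
 * and initialize b->prio, b->gen and b->last_gc for every in-memory bucket.
 */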
694static int prio_read(struct cache *ca, uint64_t bucket)
695{
696	struct prio_set *p = ca->disk_buckets;
697	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
698	struct bucket *b;
699	unsigned int bucket_nr = 0;
700	int ret = -EIO;
701
702	for (b = ca->buckets;
703	     b < ca->buckets + ca->sb.nbuckets;
704	     b++, d++) {
705		if (d == end) {
706			ca->prio_buckets[bucket_nr] = bucket;
707			ca->prio_last_buckets[bucket_nr] = bucket;
708			bucket_nr++;
709
710			prio_io(ca, bucket, REQ_OP_READ, 0);
711
712			if (p->csum !=
713			    bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
714				pr_warn("bad csum reading priorities\n");
715				goto out;
716			}
717
718			if (p->magic != pset_magic(&ca->sb)) {
719				pr_warn("bad magic reading priorities\n");
720				goto out;
721			}
722
723			bucket = p->next_bucket;
724			d = p->data;
725		}
726
727		b->prio = le16_to_cpu(d->prio);
728		b->gen = b->last_gc = d->gen;
729	}
730
731	ret = 0;
732out:
733	return ret;
734}
735
736/* Bcache device */
737
738static int open_dev(struct block_device *b, fmode_t mode)
739{
740	struct bcache_device *d = b->bd_disk->private_data;
741
742	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
743		return -ENXIO;
744
745	closure_get(&d->cl);
746	return 0;
747}
748
749static void release_dev(struct gendisk *b, fmode_t mode)
750{
751	struct bcache_device *d = b->private_data;
752
753	closure_put(&d->cl);
754}
755
756static int ioctl_dev(struct block_device *b, fmode_t mode,
757		     unsigned int cmd, unsigned long arg)
758{
759	struct bcache_device *d = b->bd_disk->private_data;
760
761	return d->ioctl(d, mode, cmd, arg);
762}
763
764static const struct block_device_operations bcache_cached_ops = {
765	.submit_bio	= cached_dev_submit_bio,
766	.open		= open_dev,
767	.release	= release_dev,
768	.ioctl		= ioctl_dev,
769	.owner		= THIS_MODULE,
770};
771
772static const struct block_device_operations bcache_flash_ops = {
773	.submit_bio	= flash_dev_submit_bio,
774	.open		= open_dev,
775	.release	= release_dev,
776	.ioctl		= ioctl_dev,
777	.owner		= THIS_MODULE,
778};
779
780void bcache_device_stop(struct bcache_device *d)
781{
782	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
783		/*
784		 * closure_fn set to
785		 * - cached device: cached_dev_flush()
786		 * - flash dev: flash_dev_flush()
787		 */
788		closure_queue(&d->cl);
789}
790
791static void bcache_device_unlink(struct bcache_device *d)
792{
793	lockdep_assert_held(&bch_register_lock);
794
795	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
796		struct cache *ca = d->c->cache;
797
798		sysfs_remove_link(&d->c->kobj, d->name);
799		sysfs_remove_link(&d->kobj, "cache");
800
801		bd_unlink_disk_holder(ca->bdev, d->disk);
802	}
803}
804
805static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
806			       const char *name)
807{
808	struct cache *ca = c->cache;
809	int ret;
810
811	bd_link_disk_holder(ca->bdev, d->disk);
812
813	snprintf(d->name, BCACHEDEVNAME_SIZE,
814		 "%s%u", name, d->id);
815
816	ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
817	if (ret < 0)
818		pr_err("Couldn't create device -> cache set symlink\n");
819
820	ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
821	if (ret < 0)
822		pr_err("Couldn't create cache set -> device symlink\n");
823
824	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
825}
826
827static void bcache_device_detach(struct bcache_device *d)
828{
829	lockdep_assert_held(&bch_register_lock);
830
831	atomic_dec(&d->c->attached_dev_nr);
832
833	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
834		struct uuid_entry *u = d->c->uuids + d->id;
835
836		SET_UUID_FLASH_ONLY(u, 0);
837		memcpy(u->uuid, invalid_uuid, 16);
838		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
839		bch_uuid_write(d->c);
840	}
841
842	bcache_device_unlink(d);
843
844	d->c->devices[d->id] = NULL;
845	closure_put(&d->c->caching);
846	d->c = NULL;
847}
848
849static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
850				 unsigned int id)
851{
852	d->id = id;
853	d->c = c;
854	c->devices[id] = d;
855
856	if (id >= c->devices_max_used)
857		c->devices_max_used = id + 1;
858
859	closure_get(&c->caching);
860}
861
862static inline int first_minor_to_idx(int first_minor)
863{
864	return (first_minor/BCACHE_MINORS);
865}
866
867static inline int idx_to_first_minor(int idx)
868{
869	return (idx * BCACHE_MINORS);
870}
871
872static void bcache_device_free(struct bcache_device *d)
873{
874	struct gendisk *disk = d->disk;
875
876	lockdep_assert_held(&bch_register_lock);
877
878	if (disk)
879		pr_info("%s stopped\n", disk->disk_name);
880	else
881		pr_err("bcache device (NULL gendisk) stopped\n");
882
883	if (d->c)
884		bcache_device_detach(d);
885
886	if (disk) {
887		bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
888
889		if (disk_added)
890			del_gendisk(disk);
891
892		if (disk->queue)
893			blk_cleanup_queue(disk->queue);
894
895		ida_simple_remove(&bcache_device_idx,
896				  first_minor_to_idx(disk->first_minor));
897		if (disk_added)
898			put_disk(disk);
899	}
900
901	bioset_exit(&d->bio_split);
902	kvfree(d->full_dirty_stripes);
903	kvfree(d->stripe_sectors_dirty);
904
905	closure_debug_destroy(&d->cl);
906}
907
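/*
 * Common initialization for cached devices and flash-only volumes: size the
 * dirty stripe tracking arrays, reserve a minor range and gendisk, and set
 * up the request queue limits. @cached_bdev is only used (and may be NULL)
 * when the logical block size has to fall back to the backing device's.
 */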
908static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
909		sector_t sectors, struct block_device *cached_bdev,
910		const struct block_device_operations *ops)
911{
912	struct request_queue *q;
913	const size_t max_stripes = min_t(size_t, INT_MAX,
914					 SIZE_MAX / sizeof(atomic_t));
915	uint64_t n;
916	int idx;
917
918	if (!d->stripe_size)
919		d->stripe_size = 1 << 31;
920	else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
921		d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size);
922
923	n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
924	if (!n || n > max_stripes) {
925		pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
926			n);
927		return -ENOMEM;
928	}
929	d->nr_stripes = n;
930
931	n = d->nr_stripes * sizeof(atomic_t);
932	d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
933	if (!d->stripe_sectors_dirty)
934		return -ENOMEM;
935
936	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
937	d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
938	if (!d->full_dirty_stripes)
939		goto out_free_stripe_sectors_dirty;
940
941	idx = ida_simple_get(&bcache_device_idx, 0,
942				BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
943	if (idx < 0)
944		goto out_free_full_dirty_stripes;
945
946	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
947			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
948		goto out_ida_remove;
949
950	d->disk = alloc_disk(BCACHE_MINORS);
951	if (!d->disk)
952		goto out_bioset_exit;
953
954	set_capacity(d->disk, sectors);
955	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
956
957	d->disk->major		= bcache_major;
958	d->disk->first_minor	= idx_to_first_minor(idx);
959	d->disk->fops		= ops;
960	d->disk->private_data	= d;
961
962	q = blk_alloc_queue(NUMA_NO_NODE);
963	if (!q)
964		return -ENOMEM;
965
966	d->disk->queue			= q;
967	q->limits.max_hw_sectors	= UINT_MAX;
968	q->limits.max_sectors		= UINT_MAX;
969	q->limits.max_segment_size	= UINT_MAX;
970	q->limits.max_segments		= BIO_MAX_PAGES;
971	blk_queue_max_discard_sectors(q, UINT_MAX);
972	q->limits.discard_granularity	= 512;
973	q->limits.io_min		= block_size;
974	q->limits.logical_block_size	= block_size;
975	q->limits.physical_block_size	= block_size;
976
977	if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
978		/*
979		 * This should only happen with BCACHE_SB_VERSION_BDEV.
980		 * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
981		 */
982		pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
983			d->disk->disk_name, q->limits.logical_block_size,
984			PAGE_SIZE, bdev_logical_block_size(cached_bdev));
985
986		/* This also adjusts physical block size/min io size if needed */
987		blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
988	}
989
990	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
991	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
992	blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
993
994	blk_queue_write_cache(q, true, true);
995
996	return 0;
997
998out_bioset_exit:
999	bioset_exit(&d->bio_split);
1000out_ida_remove:
1001	ida_simple_remove(&bcache_device_idx, idx);
1002out_free_full_dirty_stripes:
1003	kvfree(d->full_dirty_stripes);
1004out_free_stripe_sectors_dirty:
1005	kvfree(d->stripe_sectors_dirty);
1006	return -ENOMEM;
1007
1008}
1009
1010/* Cached device */
1011
1012static void calc_cached_dev_sectors(struct cache_set *c)
1013{
1014	uint64_t sectors = 0;
1015	struct cached_dev *dc;
1016
1017	list_for_each_entry(dc, &c->cached_devs, list)
1018		sectors += bdev_sectors(dc->bdev);
1019
1020	c->cached_dev_sectors = sectors;
1021}
1022
1023#define BACKING_DEV_OFFLINE_TIMEOUT 5
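/*
 * Kthread that polls the backing device's request queue roughly once a
 * second; if the queue stays dying for BACKING_DEV_OFFLINE_TIMEOUT seconds,
 * I/O to the bcache device is disabled and the device is stopped.
 */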
1024static int cached_dev_status_update(void *arg)
1025{
1026	struct cached_dev *dc = arg;
1027	struct request_queue *q;
1028
1029	/*
	 * If this kthread is being stopped from elsewhere, quit directly.
	 * dc->io_disable might be set via the sysfs interface, so check it
	 * here too.
1033	 */
1034	while (!kthread_should_stop() && !dc->io_disable) {
1035		q = bdev_get_queue(dc->bdev);
1036		if (blk_queue_dying(q))
1037			dc->offline_seconds++;
1038		else
1039			dc->offline_seconds = 0;
1040
1041		if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
1042			pr_err("%s: device offline for %d seconds\n",
1043			       dc->backing_dev_name,
1044			       BACKING_DEV_OFFLINE_TIMEOUT);
1045			pr_err("%s: disable I/O request due to backing device offline\n",
1046			       dc->disk.name);
1047			dc->io_disable = true;
1048			/* let others know earlier that io_disable is true */
1049			smp_mb();
1050			bcache_device_stop(&dc->disk);
1051			break;
1052		}
1053		schedule_timeout_interruptible(HZ);
1054	}
1055
1056	wait_for_kthread_stop();
1057	return 0;
1058}
1059
1060
1061int bch_cached_dev_run(struct cached_dev *dc)
1062{
1063	struct bcache_device *d = &dc->disk;
1064	char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
1065	char *env[] = {
1066		"DRIVER=bcache",
1067		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
1068		kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
1069		NULL,
1070	};
1071
1072	if (dc->io_disable) {
1073		pr_err("I/O disabled on cached dev %s\n",
1074		       dc->backing_dev_name);
1075		kfree(env[1]);
1076		kfree(env[2]);
1077		kfree(buf);
1078		return -EIO;
1079	}
1080
1081	if (atomic_xchg(&dc->running, 1)) {
1082		kfree(env[1]);
1083		kfree(env[2]);
1084		kfree(buf);
1085		pr_info("cached dev %s is running already\n",
1086		       dc->backing_dev_name);
1087		return -EBUSY;
1088	}
1089
1090	if (!d->c &&
1091	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
1092		struct closure cl;
1093
1094		closure_init_stack(&cl);
1095
1096		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
1097		bch_write_bdev_super(dc, &cl);
1098		closure_sync(&cl);
1099	}
1100
1101	add_disk(d->disk);
1102	bd_link_disk_holder(dc->bdev, dc->disk.disk);
1103	/*
	 * This won't show up in the uevent file; use udevadm monitor -e
	 * instead. Only class / kset properties are persistent.
1106	 */
1107	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
1108	kfree(env[1]);
1109	kfree(env[2]);
1110	kfree(buf);
1111
1112	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
1113	    sysfs_create_link(&disk_to_dev(d->disk)->kobj,
1114			      &d->kobj, "bcache")) {
1115		pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
1116		return -ENOMEM;
1117	}
1118
1119	dc->status_update_thread = kthread_run(cached_dev_status_update,
1120					       dc, "bcache_status_update");
1121	if (IS_ERR(dc->status_update_thread)) {
1122		pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
1123	}
1124
1125	return 0;
1126}
1127
1128/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, the routine of the delayed work
 * dc->writeback_rate_update is running. Wait until the routine quits
 * (BCACHE_DEV_RATE_DW_RUNNING is cleared), then cancel it. If the bit is
 * still set after time_out seconds, give up waiting and cancel it anyway.
1134 */
1135static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
1136{
1137	int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
1138
1139	do {
1140		if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1141			      &dc->disk.flags))
1142			break;
1143		time_out--;
1144		schedule_timeout_interruptible(1);
1145	} while (time_out > 0);
1146
1147	if (time_out == 0)
1148		pr_warn("give up waiting for dc->writeback_write_update to quit\n");
1149
1150	cancel_delayed_work_sync(&dc->writeback_rate_update);
1151}
1152
1153static void cached_dev_detach_finish(struct work_struct *w)
1154{
1155	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1156	struct closure cl;
1157
1158	closure_init_stack(&cl);
1159
1160	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1161	BUG_ON(refcount_read(&dc->count));
1162
1163
1164	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1165		cancel_writeback_rate_update_dwork(dc);
1166
1167	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1168		kthread_stop(dc->writeback_thread);
1169		dc->writeback_thread = NULL;
1170	}
1171
1172	memset(&dc->sb.set_uuid, 0, 16);
1173	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
1174
1175	bch_write_bdev_super(dc, &cl);
1176	closure_sync(&cl);
1177
1178	mutex_lock(&bch_register_lock);
1179
1180	calc_cached_dev_sectors(dc->disk.c);
1181	bcache_device_detach(&dc->disk);
1182	list_move(&dc->list, &uncached_devices);
1183
1184	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1185	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1186
1187	mutex_unlock(&bch_register_lock);
1188
1189	pr_info("Caching disabled for %s\n", dc->backing_dev_name);
1190
1191	/* Drop ref we took in cached_dev_detach() */
1192	closure_put(&dc->disk.cl);
1193}
1194
1195void bch_cached_dev_detach(struct cached_dev *dc)
1196{
1197	lockdep_assert_held(&bch_register_lock);
1198
1199	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1200		return;
1201
1202	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1203		return;
1204
1205	/*
1206	 * Block the device from being closed and freed until we're finished
1207	 * detaching
1208	 */
1209	closure_get(&dc->disk.cl);
1210
1211	bch_writeback_queue(dc);
1212
1213	cached_dev_put(dc);
1214}
1215
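/*
 * Attach a backing device to cache set @c: verify the set uuid and block
 * size, find (or allocate) the uuid_entry for this backing device, persist
 * the new state, then start the writeback machinery and run the bcache
 * device. Returns 0 on success or a negative errno.
 */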
1216int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1217			  uint8_t *set_uuid)
1218{
1219	uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1220	struct uuid_entry *u;
1221	struct cached_dev *exist_dc, *t;
1222	int ret = 0;
1223
1224	if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
1225	    (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
1226		return -ENOENT;
1227
1228	if (dc->disk.c) {
1229		pr_err("Can't attach %s: already attached\n",
1230		       dc->backing_dev_name);
1231		return -EINVAL;
1232	}
1233
1234	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1235		pr_err("Can't attach %s: shutting down\n",
1236		       dc->backing_dev_name);
1237		return -EINVAL;
1238	}
1239
1240	if (dc->sb.block_size < c->cache->sb.block_size) {
1241		/* Will die */
1242		pr_err("Couldn't attach %s: block size less than set's block size\n",
1243		       dc->backing_dev_name);
1244		return -EINVAL;
1245	}
1246
1247	/* Check whether already attached */
1248	list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1249		if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1250			pr_err("Tried to attach %s but duplicate UUID already attached\n",
1251				dc->backing_dev_name);
1252
1253			return -EINVAL;
1254		}
1255	}
1256
1257	u = uuid_find(c, dc->sb.uuid);
1258
1259	if (u &&
1260	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1261	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1262		memcpy(u->uuid, invalid_uuid, 16);
1263		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1264		u = NULL;
1265	}
1266
1267	if (!u) {
1268		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1269			pr_err("Couldn't find uuid for %s in set\n",
1270			       dc->backing_dev_name);
1271			return -ENOENT;
1272		}
1273
1274		u = uuid_find_empty(c);
1275		if (!u) {
1276			pr_err("Not caching %s, no room for UUID\n",
1277			       dc->backing_dev_name);
1278			return -EINVAL;
1279		}
1280	}
1281
1282	/*
1283	 * Deadlocks since we're called via sysfs...
1284	 * sysfs_remove_file(&dc->kobj, &sysfs_attach);
1285	 */
1286
1287	if (bch_is_zero(u->uuid, 16)) {
1288		struct closure cl;
1289
1290		closure_init_stack(&cl);
1291
1292		memcpy(u->uuid, dc->sb.uuid, 16);
1293		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1294		u->first_reg = u->last_reg = rtime;
1295		bch_uuid_write(c);
1296
1297		memcpy(dc->sb.set_uuid, c->set_uuid, 16);
1298		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1299
1300		bch_write_bdev_super(dc, &cl);
1301		closure_sync(&cl);
1302	} else {
1303		u->last_reg = rtime;
1304		bch_uuid_write(c);
1305	}
1306
1307	bcache_device_attach(&dc->disk, c, u - c->uuids);
1308	list_move(&dc->list, &c->cached_devs);
1309	calc_cached_dev_sectors(c);
1310
1311	/*
1312	 * dc->c must be set before dc->count != 0 - paired with the mb in
1313	 * cached_dev_get()
1314	 */
1315	smp_wmb();
1316	refcount_set(&dc->count, 1);
1317
1318	/* Block writeback thread, but spawn it */
1319	down_write(&dc->writeback_lock);
1320	if (bch_cached_dev_writeback_start(dc)) {
1321		up_write(&dc->writeback_lock);
1322		pr_err("Couldn't start writeback facilities for %s\n",
1323		       dc->disk.disk->disk_name);
1324		return -ENOMEM;
1325	}
1326
1327	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1328		atomic_set(&dc->has_dirty, 1);
1329		bch_writeback_queue(dc);
1330	}
1331
1332	bch_sectors_dirty_init(&dc->disk);
1333
1334	ret = bch_cached_dev_run(dc);
1335	if (ret && (ret != -EBUSY)) {
1336		up_write(&dc->writeback_lock);
1337		/*
		 * bch_register_lock is held, so bcache_device_stop() cannot
		 * be called directly. The kthread and kworker created
		 * previously in bch_cached_dev_writeback_start() have to be
		 * stopped manually here.
1342		 */
1343		kthread_stop(dc->writeback_thread);
1344		cancel_writeback_rate_update_dwork(dc);
1345		pr_err("Couldn't run cached device %s\n",
1346		       dc->backing_dev_name);
1347		return ret;
1348	}
1349
1350	bcache_device_link(&dc->disk, c, "bdev");
1351	atomic_inc(&c->attached_dev_nr);
1352
1353	if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
		pr_err("The obsolete large bucket layout is unsupported, setting the bcache device read-only\n");
1355		pr_err("Please update to the latest bcache-tools to create the cache device\n");
1356		set_disk_ro(dc->disk.disk, 1);
1357	}
1358
1359	/* Allow the writeback thread to proceed */
1360	up_write(&dc->writeback_lock);
1361
1362	pr_info("Caching %s as %s on set %pU\n",
1363		dc->backing_dev_name,
1364		dc->disk.disk->disk_name,
1365		dc->disk.c->set_uuid);
1366	return 0;
1367}
1368
1369/* when dc->disk.kobj released */
1370void bch_cached_dev_release(struct kobject *kobj)
1371{
1372	struct cached_dev *dc = container_of(kobj, struct cached_dev,
1373					     disk.kobj);
1374	kfree(dc);
1375	module_put(THIS_MODULE);
1376}
1377
1378static void cached_dev_free(struct closure *cl)
1379{
1380	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1381
1382	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1383		cancel_writeback_rate_update_dwork(dc);
1384
1385	if (!IS_ERR_OR_NULL(dc->writeback_thread))
1386		kthread_stop(dc->writeback_thread);
1387	if (!IS_ERR_OR_NULL(dc->status_update_thread))
1388		kthread_stop(dc->status_update_thread);
1389
1390	mutex_lock(&bch_register_lock);
1391
1392	if (atomic_read(&dc->running))
1393		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1394	bcache_device_free(&dc->disk);
1395	list_del(&dc->list);
1396
1397	mutex_unlock(&bch_register_lock);
1398
1399	if (dc->sb_disk)
1400		put_page(virt_to_page(dc->sb_disk));
1401
1402	if (!IS_ERR_OR_NULL(dc->bdev))
1403		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1404
1405	wake_up(&unregister_wait);
1406
1407	kobject_put(&dc->disk.kobj);
1408}
1409
1410static void cached_dev_flush(struct closure *cl)
1411{
1412	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1413	struct bcache_device *d = &dc->disk;
1414
1415	mutex_lock(&bch_register_lock);
1416	bcache_device_unlink(d);
1417	mutex_unlock(&bch_register_lock);
1418
1419	bch_cache_accounting_destroy(&dc->accounting);
1420	kobject_del(&d->kobj);
1421
1422	continue_at(cl, cached_dev_free, system_wq);
1423}
1424
1425static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1426{
1427	int ret;
1428	struct io *io;
1429	struct request_queue *q = bdev_get_queue(dc->bdev);
1430
1431	__module_get(THIS_MODULE);
1432	INIT_LIST_HEAD(&dc->list);
1433	closure_init(&dc->disk.cl, NULL);
1434	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1435	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1436	INIT_WORK(&dc->detach, cached_dev_detach_finish);
1437	sema_init(&dc->sb_write_mutex, 1);
1438	INIT_LIST_HEAD(&dc->io_lru);
1439	spin_lock_init(&dc->io_lock);
1440	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1441
1442	dc->sequential_cutoff		= 4 << 20;
1443
1444	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1445		list_add(&io->lru, &dc->io_lru);
1446		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1447	}
1448
1449	dc->disk.stripe_size = q->limits.io_opt >> 9;
1450
1451	if (dc->disk.stripe_size)
1452		dc->partial_stripes_expensive =
1453			q->limits.raid_partial_stripes_expensive;
1454
1455	ret = bcache_device_init(&dc->disk, block_size,
1456			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset,
1457			 dc->bdev, &bcache_cached_ops);
1458	if (ret)
1459		return ret;
1460
1461	blk_queue_io_opt(dc->disk.disk->queue,
1462		max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
1463
1464	atomic_set(&dc->io_errors, 0);
1465	dc->io_disable = false;
1466	dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1467	/* default to auto */
1468	dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1469
1470	bch_cached_dev_request_init(dc);
1471	bch_cached_dev_writeback_init(dc);
1472	return 0;
1473}
1474
1475/* Cached device - bcache superblock */
1476
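/*
 * Register a backing device: take ownership of @bdev and @sb_disk, set up
 * the cached_dev and its kobjects, try to attach it to every known cache
 * set, and run it right away if its superblock state is BDEV_STATE_NONE or
 * BDEV_STATE_STALE.
 */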
1477static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
1478				 struct block_device *bdev,
1479				 struct cached_dev *dc)
1480{
1481	const char *err = "cannot allocate memory";
1482	struct cache_set *c;
1483	int ret = -ENOMEM;
1484
1485	bdevname(bdev, dc->backing_dev_name);
1486	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1487	dc->bdev = bdev;
1488	dc->bdev->bd_holder = dc;
1489	dc->sb_disk = sb_disk;
1490
1491	if (cached_dev_init(dc, sb->block_size << 9))
1492		goto err;
1493
1494	err = "error creating kobject";
1495	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1496			"bcache"))
1497		goto err;
1498	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1499		goto err;
1500
1501	pr_info("registered backing device %s\n", dc->backing_dev_name);
1502
1503	list_add(&dc->list, &uncached_devices);
1504	/* attach to a matched cache set if it exists */
1505	list_for_each_entry(c, &bch_cache_sets, list)
1506		bch_cached_dev_attach(dc, c, NULL);
1507
1508	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1509	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1510		err = "failed to run cached device";
1511		ret = bch_cached_dev_run(dc);
1512		if (ret)
1513			goto err;
1514	}
1515
1516	return 0;
1517err:
1518	pr_notice("error %s: %s\n", dc->backing_dev_name, err);
1519	bcache_device_stop(&dc->disk);
1520	return ret;
1521}
1522
1523/* Flash only volumes */
1524
1525/* When d->kobj released */
1526void bch_flash_dev_release(struct kobject *kobj)
1527{
1528	struct bcache_device *d = container_of(kobj, struct bcache_device,
1529					       kobj);
1530	kfree(d);
1531}
1532
1533static void flash_dev_free(struct closure *cl)
1534{
1535	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1536
1537	mutex_lock(&bch_register_lock);
1538	atomic_long_sub(bcache_dev_sectors_dirty(d),
1539			&d->c->flash_dev_dirty_sectors);
1540	bcache_device_free(d);
1541	mutex_unlock(&bch_register_lock);
1542	kobject_put(&d->kobj);
1543}
1544
1545static void flash_dev_flush(struct closure *cl)
1546{
1547	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1548
1549	mutex_lock(&bch_register_lock);
1550	bcache_device_unlink(d);
1551	mutex_unlock(&bch_register_lock);
1552	kobject_del(&d->kobj);
1553	continue_at(cl, flash_dev_free, system_wq);
1554}
1555
1556static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1557{
1558	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1559					  GFP_KERNEL);
1560	if (!d)
1561		return -ENOMEM;
1562
1563	closure_init(&d->cl, NULL);
1564	set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1565
1566	kobject_init(&d->kobj, &bch_flash_dev_ktype);
1567
1568	if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
1569			NULL, &bcache_flash_ops))
1570		goto err;
1571
1572	bcache_device_attach(d, c, u - c->uuids);
1573	bch_sectors_dirty_init(d);
1574	bch_flash_dev_request_init(d);
1575	add_disk(d->disk);
1576
1577	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1578		goto err;
1579
1580	bcache_device_link(d, c, "volume");
1581
1582	if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
		pr_err("The obsolete large bucket layout is unsupported, setting the bcache device read-only\n");
1584		pr_err("Please update to the latest bcache-tools to create the cache device\n");
1585		set_disk_ro(d->disk, 1);
1586	}
1587
1588	return 0;
1589err:
1590	kobject_put(&d->kobj);
1591	return -ENOMEM;
1592}
1593
1594static int flash_devs_run(struct cache_set *c)
1595{
1596	int ret = 0;
1597	struct uuid_entry *u;
1598
1599	for (u = c->uuids;
1600	     u < c->uuids + c->nr_uuids && !ret;
1601	     u++)
1602		if (UUID_FLASH_ONLY(u))
1603			ret = flash_dev_run(c, u);
1604
1605	return ret;
1606}
1607
1608int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1609{
1610	struct uuid_entry *u;
1611
1612	if (test_bit(CACHE_SET_STOPPING, &c->flags))
1613		return -EINTR;
1614
1615	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1616		return -EPERM;
1617
1618	u = uuid_find_empty(c);
1619	if (!u) {
1620		pr_err("Can't create volume, no room for UUID\n");
1621		return -EINVAL;
1622	}
1623
1624	get_random_bytes(u->uuid, 16);
1625	memset(u->label, 0, 32);
1626	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1627
1628	SET_UUID_FLASH_ONLY(u, 1);
1629	u->sectors = size >> 9;
1630
1631	bch_uuid_write(c);
1632
1633	return flash_dev_run(c, u);
1634}
1635
1636bool bch_cached_dev_error(struct cached_dev *dc)
1637{
1638	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1639		return false;
1640
1641	dc->io_disable = true;
1642	/* make others know io_disable is true earlier */
1643	smp_mb();
1644
1645	pr_err("stop %s: too many IO errors on backing device %s\n",
1646	       dc->disk.disk->disk_name, dc->backing_dev_name);
1647
1648	bcache_device_stop(&dc->disk);
1649	return true;
1650}
1651
1652/* Cache set */
1653
1654__printf(2, 3)
1655bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1656{
1657	struct va_format vaf;
1658	va_list args;
1659
1660	if (c->on_error != ON_ERROR_PANIC &&
1661	    test_bit(CACHE_SET_STOPPING, &c->flags))
1662		return false;
1663
1664	if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1665		pr_info("CACHE_SET_IO_DISABLE already set\n");
1666
1667	/*
1668	 * XXX: we can be called from atomic context
1669	 * acquire_console_sem();
1670	 */
1671
1672	va_start(args, fmt);
1673
1674	vaf.fmt = fmt;
1675	vaf.va = &args;
1676
1677	pr_err("error on %pU: %pV, disabling caching\n",
1678	       c->set_uuid, &vaf);
1679
1680	va_end(args);
1681
1682	if (c->on_error == ON_ERROR_PANIC)
1683		panic("panic forced after error\n");
1684
1685	bch_cache_set_unregister(c);
1686	return true;
1687}
1688
1689/* When c->kobj released */
1690void bch_cache_set_release(struct kobject *kobj)
1691{
1692	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1693
1694	kfree(c);
1695	module_put(THIS_MODULE);
1696}
1697
1698static void cache_set_free(struct closure *cl)
1699{
1700	struct cache_set *c = container_of(cl, struct cache_set, cl);
1701	struct cache *ca;
1702
1703	debugfs_remove(c->debug);
1704
1705	bch_open_buckets_free(c);
1706	bch_btree_cache_free(c);
1707	bch_journal_free(c);
1708
1709	mutex_lock(&bch_register_lock);
1710	bch_bset_sort_state_free(&c->sort);
1711	free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
1712
1713	ca = c->cache;
1714	if (ca) {
1715		ca->set = NULL;
1716		c->cache = NULL;
1717		kobject_put(&ca->kobj);
1718	}
1719
1720
1721	if (c->moving_gc_wq)
1722		destroy_workqueue(c->moving_gc_wq);
1723	bioset_exit(&c->bio_split);
1724	mempool_exit(&c->fill_iter);
1725	mempool_exit(&c->bio_meta);
1726	mempool_exit(&c->search);
1727	kfree(c->devices);
1728
1729	list_del(&c->list);
1730	mutex_unlock(&bch_register_lock);
1731
1732	pr_info("Cache set %pU unregistered\n", c->set_uuid);
1733	wake_up(&unregister_wait);
1734
1735	closure_debug_destroy(&c->cl);
1736	kobject_put(&c->kobj);
1737}
1738
1739static void cache_set_flush(struct closure *cl)
1740{
1741	struct cache_set *c = container_of(cl, struct cache_set, caching);
1742	struct cache *ca = c->cache;
1743	struct btree *b;
1744
1745	bch_cache_accounting_destroy(&c->accounting);
1746
1747	kobject_put(&c->internal);
1748	kobject_del(&c->kobj);
1749
1750	if (!IS_ERR_OR_NULL(c->gc_thread))
1751		kthread_stop(c->gc_thread);
1752
1753	if (!IS_ERR(c->root))
1754		list_add(&c->root->list, &c->btree_cache);
1755
1756	/*
	 * Avoid flushing cached nodes if the cache set is retiring
	 * due to too many detected I/O errors.
1759	 */
1760	if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1761		list_for_each_entry(b, &c->btree_cache, list) {
1762			mutex_lock(&b->write_lock);
1763			if (btree_node_dirty(b))
1764				__bch_btree_node_write(b, NULL);
1765			mutex_unlock(&b->write_lock);
1766		}
1767
1768	if (ca->alloc_thread)
1769		kthread_stop(ca->alloc_thread);
1770
1771	if (c->journal.cur) {
1772		cancel_delayed_work_sync(&c->journal.work);
1773		/* flush last journal entry if needed */
1774		c->journal.work.work.func(&c->journal.work.work);
1775	}
1776
1777	closure_return(cl);
1778}
1779
1780/*
1781 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * the cache set is unregistering due to too many I/O errors. In this
 * condition, the bcache device might be stopped; whether it is depends on
 * the stop_when_cache_set_failed value and on whether the broken cache has
 * dirty data:
 *
 * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
 *  BCH_CACHED_DEV_STOP_AUTO           0               NO
 *  BCH_CACHED_DEV_STOP_AUTO           1               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
 *
 * The expected behavior is: if stop_when_cache_set_failed is configured to
 * "auto" via the sysfs interface, the bcache device will not be stopped as
 * long as the backing device has no dirty data on the broken cache set.
1795 */
1796static void conditional_stop_bcache_device(struct cache_set *c,
1797					   struct bcache_device *d,
1798					   struct cached_dev *dc)
1799{
1800	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1801		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
1802			d->disk->disk_name, c->set_uuid);
1803		bcache_device_stop(d);
1804	} else if (atomic_read(&dc->has_dirty)) {
1805		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
1807		 * and dc->has_dirty == 1
1808		 */
1809		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
1810			d->disk->disk_name);
1811		/*
		 * There might be a small time gap in which the cache set
		 * is released but the bcache device is not. Inside this
		 * gap, regular I/O requests will go directly to the
		 * backing device, since no cache set is attached. In
		 * writeback mode with dirty data this may leave the
		 * backing device inconsistent. Therefore, before calling
		 * bcache_device_stop() due to a broken cache device,
		 * dc->io_disable should be explicitly set to true.
1821		 */
1822		dc->io_disable = true;
1823		/* make others know io_disable is true earlier */
1824		smp_mb();
1825		bcache_device_stop(d);
1826	} else {
1827		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
1829		 * and dc->has_dirty == 0
1830		 */
1831		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
1832			d->disk->disk_name);
1833	}
1834}
1835
1836static void __cache_set_unregister(struct closure *cl)
1837{
1838	struct cache_set *c = container_of(cl, struct cache_set, caching);
1839	struct cached_dev *dc;
1840	struct bcache_device *d;
1841	size_t i;
1842
1843	mutex_lock(&bch_register_lock);
1844
1845	for (i = 0; i < c->devices_max_used; i++) {
1846		d = c->devices[i];
1847		if (!d)
1848			continue;
1849
1850		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1851		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1852			dc = container_of(d, struct cached_dev, disk);
1853			bch_cached_dev_detach(dc);
1854			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1855				conditional_stop_bcache_device(c, d, dc);
1856		} else {
1857			bcache_device_stop(d);
1858		}
1859	}
1860
1861	mutex_unlock(&bch_register_lock);
1862
1863	continue_at(cl, cache_set_flush, system_wq);
1864}
1865
1866void bch_cache_set_stop(struct cache_set *c)
1867{
1868	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1869		/* closure_fn set to __cache_set_unregister() */
1870		closure_queue(&c->caching);
1871}
1872
1873void bch_cache_set_unregister(struct cache_set *c)
1874{
1875	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1876	bch_cache_set_stop(c);
1877}
1878
1879#define alloc_meta_bucket_pages(gfp, sb)		\
1880	((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
1881
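/*
 * Allocate and initialize a cache_set for the cache described by @sb:
 * closures, locks, mempools, the bio split pool, the uuid area and the
 * btree/journal machinery. On failure the partially constructed set is torn
 * down via bch_cache_set_unregister() and NULL is returned.
 */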
1882struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1883{
1884	int iter_size;
1885	struct cache *ca = container_of(sb, struct cache, sb);
1886	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1887
1888	if (!c)
1889		return NULL;
1890
1891	__module_get(THIS_MODULE);
1892	closure_init(&c->cl, NULL);
1893	set_closure_fn(&c->cl, cache_set_free, system_wq);
1894
1895	closure_init(&c->caching, &c->cl);
1896	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1897
1898	/* Maybe create continue_at_noreturn() and use it here? */
1899	closure_set_stopped(&c->cl);
1900	closure_put(&c->cl);
1901
1902	kobject_init(&c->kobj, &bch_cache_set_ktype);
1903	kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1904
1905	bch_cache_accounting_init(&c->accounting, &c->cl);
1906
1907	memcpy(c->set_uuid, sb->set_uuid, 16);
1908
1909	c->cache		= ca;
1910	c->cache->set		= c;
1911	c->bucket_bits		= ilog2(sb->bucket_size);
1912	c->block_bits		= ilog2(sb->block_size);
1913	c->nr_uuids		= meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
1914	c->devices_max_used	= 0;
1915	atomic_set(&c->attached_dev_nr, 0);
1916	c->btree_pages		= meta_bucket_pages(sb);
1917	if (c->btree_pages > BTREE_MAX_PAGES)
1918		c->btree_pages = max_t(int, c->btree_pages / 4,
1919				       BTREE_MAX_PAGES);
1920
1921	sema_init(&c->sb_write_mutex, 1);
1922	mutex_init(&c->bucket_lock);
1923	init_waitqueue_head(&c->btree_cache_wait);
1924	spin_lock_init(&c->btree_cannibalize_lock);
1925	init_waitqueue_head(&c->bucket_wait);
1926	init_waitqueue_head(&c->gc_wait);
1927	sema_init(&c->uuid_write_mutex, 1);
1928
1929	spin_lock_init(&c->btree_gc_time.lock);
1930	spin_lock_init(&c->btree_split_time.lock);
1931	spin_lock_init(&c->btree_read_time.lock);
1932
1933	bch_moving_init_cache_set(c);
1934
1935	INIT_LIST_HEAD(&c->list);
1936	INIT_LIST_HEAD(&c->cached_devs);
1937	INIT_LIST_HEAD(&c->btree_cache);
1938	INIT_LIST_HEAD(&c->btree_cache_freeable);
1939	INIT_LIST_HEAD(&c->btree_cache_freed);
1940	INIT_LIST_HEAD(&c->data_buckets);
1941
1942	iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
1943		sizeof(struct btree_iter_set);
1944
1945	c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
1946	if (!c->devices)
1947		goto err;
1948
1949	if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
1950		goto err;
1951
1952	if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
1953			sizeof(struct bbio) +
1954			sizeof(struct bio_vec) * meta_bucket_pages(sb)))
1955		goto err;
1956
1957	if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
1958		goto err;
1959
1960	if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1961			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
1962		goto err;
1963
1964	c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
1965	if (!c->uuids)
1966		goto err;
1967
1968	c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
1969	if (!c->moving_gc_wq)
1970		goto err;
1971
1972	if (bch_journal_alloc(c))
1973		goto err;
1974
1975	if (bch_btree_cache_alloc(c))
1976		goto err;
1977
1978	if (bch_open_buckets_alloc(c))
1979		goto err;
1980
1981	if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1982		goto err;
1983
1984	c->congested_read_threshold_us	= 2000;
1985	c->congested_write_threshold_us	= 20000;
1986	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
1987	c->idle_max_writeback_rate_enabled = 1;
1988	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1989
1990	return c;
1991err:
1992	bch_cache_set_unregister(c);
1993	return NULL;
1994}
1995
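/*
 * Bring a cache set on line: for a cache that was previously in sync,
 * replay the journal and read the existing btree root; otherwise
 * initialize fresh metadata. Then start the allocator and gc threads
 * and attach any backing devices waiting on this set.
 */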
1996static int run_cache_set(struct cache_set *c)
1997{
1998	const char *err = "cannot allocate memory";
1999	struct cached_dev *dc, *t;
2000	struct cache *ca = c->cache;
2001	struct closure cl;
2002	LIST_HEAD(journal);
2003	struct journal_replay *l;
2004
2005	closure_init_stack(&cl);
2006
2007	c->nbuckets = ca->sb.nbuckets;
2008	set_gc_sectors(c);
2009
2010	if (CACHE_SYNC(&c->cache->sb)) {
2011		struct bkey *k;
2012		struct jset *j;
2013
2014		err = "cannot allocate memory for journal";
2015		if (bch_journal_read(c, &journal))
2016			goto err;
2017
2018		pr_debug("bch_journal_read() done\n");
2019
2020		err = "no journal entries found";
2021		if (list_empty(&journal))
2022			goto err;
2023
2024		j = &list_entry(journal.prev, struct journal_replay, list)->j;
2025
2026		err = "IO error reading priorities";
2027		if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
2028			goto err;
2029
2030		/*
2031		 * If prio_read() fails it'll call cache_set_error and we'll
2032		 * tear everything down right away, but if we perhaps checked
2033		 * sooner we could avoid journal replay.
2034		 */
2035
2036		k = &j->btree_root;
2037
2038		err = "bad btree root";
2039		if (__bch_btree_ptr_invalid(c, k))
2040			goto err;
2041
2042		err = "error reading btree root";
2043		c->root = bch_btree_node_get(c, NULL, k,
2044					     j->btree_level,
2045					     true, NULL);
2046		if (IS_ERR(c->root))
2047			goto err;
2048
2049		list_del_init(&c->root->list);
2050		rw_unlock(true, c->root);
2051
2052		err = uuid_read(c, j, &cl);
2053		if (err)
2054			goto err;
2055
2056		err = "error in recovery";
2057		if (bch_btree_check(c))
2058			goto err;
2059
2060		bch_journal_mark(c, &journal);
2061		bch_initial_gc_finish(c);
2062		pr_debug("btree_check() done\n");
2063
2064		/*
2065		 * bcache_journal_next() can't happen sooner, or
2066		 * btree_gc_finish() will give spurious errors about last_gc >
2067		 * gc_gen - this is a hack but oh well.
2068		 */
2069		bch_journal_next(&c->journal);
2070
2071		err = "error starting allocator thread";
2072		if (bch_cache_allocator_start(ca))
2073			goto err;
2074
2075		/*
2076		 * First place it's safe to allocate: btree_check() and
2077		 * btree_gc_finish() have to run before we have buckets to
2078		 * allocate, and bch_bucket_alloc_set() might cause a journal
2079		 * entry to be written so bcache_journal_next() has to be called
2080		 * first.
2081		 *
2082		 * If the uuids were in the old format we have to rewrite them
2083		 * before the next journal entry is written:
2084		 */
2085		if (j->version < BCACHE_JSET_VERSION_UUID)
2086			__uuid_write(c);
2087
2088		err = "bcache: replay journal failed";
2089		if (bch_journal_replay(c, &journal))
2090			goto err;
2091	} else {
2092		unsigned int j;
2093
2094		pr_notice("invalidating existing data\n");
2095		ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
2096					2, SB_JOURNAL_BUCKETS);
2097
2098		for (j = 0; j < ca->sb.keys; j++)
2099			ca->sb.d[j] = ca->sb.first_bucket + j;
2100
2101		bch_initial_gc_finish(c);
2102
2103		err = "error starting allocator thread";
2104		if (bch_cache_allocator_start(ca))
2105			goto err;
2106
2107		mutex_lock(&c->bucket_lock);
2108		bch_prio_write(ca, true);
2109		mutex_unlock(&c->bucket_lock);
2110
2111		err = "cannot allocate new UUID bucket";
2112		if (__uuid_write(c))
2113			goto err;
2114
2115		err = "cannot allocate new btree root";
2116		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
2117		if (IS_ERR(c->root))
2118			goto err;
2119
2120		mutex_lock(&c->root->write_lock);
2121		bkey_copy_key(&c->root->key, &MAX_KEY);
2122		bch_btree_node_write(c->root, &cl);
2123		mutex_unlock(&c->root->write_lock);
2124
2125		bch_btree_set_root(c->root);
2126		rw_unlock(true, c->root);
2127
2128		/*
2129		 * We don't want to write the first journal entry until
2130		 * everything is set up - fortunately journal entries won't be
2131		 * written until the SET_CACHE_SYNC() here:
2132		 */
2133		SET_CACHE_SYNC(&c->cache->sb, true);
2134
2135		bch_journal_next(&c->journal);
2136		bch_journal_meta(c, &cl);
2137	}
2138
2139	err = "error starting gc thread";
2140	if (bch_gc_thread_start(c))
2141		goto err;
2142
2143	closure_sync(&cl);
2144	c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
2145	bcache_write_super(c);
2146
2147	if (bch_has_feature_obso_large_bucket(&c->cache->sb))
2148		pr_err("Detected obsoleted large bucket layout, all attached bcache devices will be read-only\n");
2149
2150	list_for_each_entry_safe(dc, t, &uncached_devices, list)
2151		bch_cached_dev_attach(dc, c, NULL);
2152
2153	flash_devs_run(c);
2154
2155	bch_journal_space_reserve(&c->journal);
2156	set_bit(CACHE_SET_RUNNING, &c->flags);
2157	return 0;
2158err:
2159	while (!list_empty(&journal)) {
2160		l = list_first_entry(&journal, struct journal_replay, list);
2161		list_del(&l->list);
2162		kfree(l);
2163	}
2164
2165	closure_sync(&cl);
2166
2167	bch_cache_set_error(c, "%s", err);
2168
2169	return -EIO;
2170}
2171
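/*
 * Add cache @ca to the cache set identified by its set_uuid, allocating
 * a new cache_set if none is registered yet. Returns NULL on success or
 * an error string on failure.
 */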
2172static const char *register_cache_set(struct cache *ca)
2173{
2174	char buf[12];
2175	const char *err = "cannot allocate memory";
2176	struct cache_set *c;
2177
2178	list_for_each_entry(c, &bch_cache_sets, list)
2179		if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
2180			if (c->cache)
2181				return "duplicate cache set member";
2182
2183			goto found;
2184		}
2185
2186	c = bch_cache_set_alloc(&ca->sb);
2187	if (!c)
2188		return err;
2189
2190	err = "error creating kobject";
2191	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
2192	    kobject_add(&c->internal, &c->kobj, "internal"))
2193		goto err;
2194
2195	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2196		goto err;
2197
2198	bch_debug_init_cache_set(c);
2199
2200	list_add(&c->list, &bch_cache_sets);
2201found:
2202	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2203	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2204	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
2205		goto err;
2206
2207	kobject_get(&ca->kobj);
2208	ca->set = c;
2209	ca->set->cache = ca;
2210
2211	err = "failed to run cache set";
2212	if (run_cache_set(c) < 0)
2213		goto err;
2214
2215	return NULL;
2216err:
2217	bch_cache_set_unregister(c);
2218	return err;
2219}
2220
2221/* Cache device */
2222
2223/* Called when ca->kobj is released */
2224void bch_cache_release(struct kobject *kobj)
2225{
2226	struct cache *ca = container_of(kobj, struct cache, kobj);
2227	unsigned int i;
2228
2229	if (ca->set) {
2230		BUG_ON(ca->set->cache != ca);
2231		ca->set->cache = NULL;
2232	}
2233
2234	free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
2235	kfree(ca->prio_buckets);
2236	vfree(ca->buckets);
2237
2238	free_heap(&ca->heap);
2239	free_fifo(&ca->free_inc);
2240
2241	for (i = 0; i < RESERVE_NR; i++)
2242		free_fifo(&ca->free[i]);
2243
2244	if (ca->sb_disk)
2245		put_page(virt_to_page(ca->sb_disk));
2246
2247	if (!IS_ERR_OR_NULL(ca->bdev))
2248		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2249
2250	kfree(ca);
2251	module_put(THIS_MODULE);
2252}
2253
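/*
 * Allocate the in-memory structures of cache @ca: the reserve fifos,
 * free_inc fifo, heap, bucket array and prio/disk bucket buffers.
 * On failure everything allocated so far is freed again.
 */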
2254static int cache_alloc(struct cache *ca)
2255{
2256	size_t free;
2257	size_t btree_buckets;
2258	struct bucket *b;
2259	int ret = -ENOMEM;
2260	const char *err = NULL;
2261
2262	__module_get(THIS_MODULE);
2263	kobject_init(&ca->kobj, &bch_cache_ktype);
2264
2265	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2266
2267	/*
2268	 * When ca->sb.njournal_buckets is not zero a journal exists,
2269	 * and during bch_journal_replay() btree nodes may split,
2270	 * so buckets of the RESERVE_BTREE type are needed.
2271	 * In the worst case all journal buckets contain valid journal
2272	 * entries and every key needs to be replayed,
2273	 * so reserve as many RESERVE_BTREE type buckets
2274	 * as there are journal buckets.
2275	 */
2276	btree_buckets = ca->sb.njournal_buckets ?: 8;
2277	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2278	if (!free) {
2279		ret = -EPERM;
2280		err = "ca->sb.nbuckets is too small";
2281		goto err_free;
2282	}
2283
2284	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2285						GFP_KERNEL)) {
2286		err = "ca->free[RESERVE_BTREE] alloc failed";
2287		goto err_btree_alloc;
2288	}
2289
2290	if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2291							GFP_KERNEL)) {
2292		err = "ca->free[RESERVE_PRIO] alloc failed";
2293		goto err_prio_alloc;
2294	}
2295
2296	if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2297		err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2298		goto err_movinggc_alloc;
2299	}
2300
2301	if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2302		err = "ca->free[RESERVE_NONE] alloc failed";
2303		goto err_none_alloc;
2304	}
2305
2306	if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2307		err = "ca->free_inc alloc failed";
2308		goto err_free_inc_alloc;
2309	}
2310
2311	if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2312		err = "ca->heap alloc failed";
2313		goto err_heap_alloc;
2314	}
2315
2316	ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2317			      ca->sb.nbuckets));
2318	if (!ca->buckets) {
2319		err = "ca->buckets alloc failed";
2320		goto err_buckets_alloc;
2321	}
2322
2323	ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2324				   prio_buckets(ca), 2),
2325				   GFP_KERNEL);
2326	if (!ca->prio_buckets) {
2327		err = "ca->prio_buckets alloc failed";
2328		goto err_prio_buckets_alloc;
2329	}
2330
2331	ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
2332	if (!ca->disk_buckets) {
2333		err = "ca->disk_buckets alloc failed";
2334		goto err_disk_buckets_alloc;
2335	}
2336
2337	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2338
2339	for_each_bucket(b, ca)
2340		atomic_set(&b->pin, 0);
2341	return 0;
2342
2343err_disk_buckets_alloc:
2344	kfree(ca->prio_buckets);
2345err_prio_buckets_alloc:
2346	vfree(ca->buckets);
2347err_buckets_alloc:
2348	free_heap(&ca->heap);
2349err_heap_alloc:
2350	free_fifo(&ca->free_inc);
2351err_free_inc_alloc:
2352	free_fifo(&ca->free[RESERVE_NONE]);
2353err_none_alloc:
2354	free_fifo(&ca->free[RESERVE_MOVINGGC]);
2355err_movinggc_alloc:
2356	free_fifo(&ca->free[RESERVE_PRIO]);
2357err_prio_alloc:
2358	free_fifo(&ca->free[RESERVE_BTREE]);
2359err_btree_alloc:
2360err_free:
2361	module_put(THIS_MODULE);
2362	if (err)
2363		pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2364	return ret;
2365}
2366
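/*
 * Register a cache device: copy the superblock read from @bdev into
 * @ca, allocate the in-memory structures and join (or create) the
 * matching cache set.
 */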
2367static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
2368				struct block_device *bdev, struct cache *ca)
2369{
2370	const char *err = NULL; /* must be set for any error case */
2371	int ret = 0;
2372
2373	bdevname(bdev, ca->cache_dev_name);
2374	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2375	ca->bdev = bdev;
2376	ca->bdev->bd_holder = ca;
2377	ca->sb_disk = sb_disk;
2378
2379	if (blk_queue_discard(bdev_get_queue(bdev)))
2380		ca->discard = CACHE_DISCARD(&ca->sb);
2381
2382	ret = cache_alloc(ca);
2383	if (ret != 0) {
2384		/*
2385		 * If we fail here, ca->kobj is not initialized yet, so
2386		 * kobject_put() won't be called and bch_cache_release()
2387		 * never gets a chance to call blkdev_put() on bdev. So
2388		 * call blkdev_put() explicitly here.
2389		 */
2390		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2391		if (ret == -ENOMEM)
2392			err = "cache_alloc(): -ENOMEM";
2393		else if (ret == -EPERM)
2394			err = "cache_alloc(): cache device is too small";
2395		else
2396			err = "cache_alloc(): unknown error";
2397		goto err;
2398	}
2399
2400	if (kobject_add(&ca->kobj,
2401			&part_to_dev(bdev->bd_part)->kobj,
2402			"bcache")) {
2403		err = "error calling kobject_add";
2404		ret = -ENOMEM;
2405		goto out;
2406	}
2407
2408	mutex_lock(&bch_register_lock);
2409	err = register_cache_set(ca);
2410	mutex_unlock(&bch_register_lock);
2411
2412	if (err) {
2413		ret = -ENODEV;
2414		goto out;
2415	}
2416
2417	pr_info("registered cache device %s\n", ca->cache_dev_name);
2418
2419out:
2420	kobject_put(&ca->kobj);
2421
2422err:
2423	if (err)
2424		pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2425
2426	return ret;
2427}
2428
2429/* Global interfaces/init */
2430
2431static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2432			       const char *buffer, size_t size);
2433static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2434					 struct kobj_attribute *attr,
2435					 const char *buffer, size_t size);
2436
2437kobj_attribute_write(register,		register_bcache);
2438kobj_attribute_write(register_quiet,	register_bcache);
2439kobj_attribute_write(pendings_cleanup,	bch_pending_bdevs_cleanup);
2440
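/*
 * The bch_is_open*() helpers check whether @bdev is already in use by a
 * registered backing or cache device; called with bch_register_lock held.
 */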
2441static bool bch_is_open_backing(struct block_device *bdev)
2442{
2443	struct cache_set *c, *tc;
2444	struct cached_dev *dc, *t;
2445
2446	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2447		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2448			if (dc->bdev == bdev)
2449				return true;
2450	list_for_each_entry_safe(dc, t, &uncached_devices, list)
2451		if (dc->bdev == bdev)
2452			return true;
2453	return false;
2454}
2455
2456static bool bch_is_open_cache(struct block_device *bdev)
2457{
2458	struct cache_set *c, *tc;
2459
2460	list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2461		struct cache *ca = c->cache;
2462
2463		if (ca->bdev == bdev)
2464			return true;
2465	}
2466
2467	return false;
2468}
2469
2470static bool bch_is_open(struct block_device *bdev)
2471{
2472	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
2473}
2474
2475struct async_reg_args {
2476	struct delayed_work reg_work;
2477	char *path;
2478	struct cache_sb *sb;
2479	struct cache_sb_disk *sb_disk;
2480	struct block_device *bdev;
2481};
2482
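/*
 * Worker for asynchronous registration of a backing device: allocate
 * the cached_dev and call register_bdev() under bch_register_lock,
 * then free the async_reg_args and drop the module reference.
 */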
2483static void register_bdev_worker(struct work_struct *work)
2484{
2485	int fail = false;
2486	struct async_reg_args *args =
2487		container_of(work, struct async_reg_args, reg_work.work);
2488	struct cached_dev *dc;
2489
2490	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2491	if (!dc) {
2492		fail = true;
2493		put_page(virt_to_page(args->sb_disk));
2494		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2495		goto out;
2496	}
2497
2498	mutex_lock(&bch_register_lock);
2499	if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
2500		fail = true;
2501	mutex_unlock(&bch_register_lock);
2502
2503out:
2504	if (fail)
2505		pr_info("error %s: fail to register backing device\n",
2506			args->path);
2507	kfree(args->sb);
2508	kfree(args->path);
2509	kfree(args);
2510	module_put(THIS_MODULE);
2511}
2512
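/*
 * Worker for asynchronous registration of a cache device; the
 * counterpart of register_bdev_worker() above.
 */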
2513static void register_cache_worker(struct work_struct *work)
2514{
2515	int fail = false;
2516	struct async_reg_args *args =
2517		container_of(work, struct async_reg_args, reg_work.work);
2518	struct cache *ca;
2519
2520	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2521	if (!ca) {
2522		fail = true;
2523		put_page(virt_to_page(args->sb_disk));
2524		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2525		goto out;
2526	}
2527
2528	/* blkdev_put() will be called in bch_cache_release() */
2529	if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
2530		fail = true;
2531
2532out:
2533	if (fail)
2534		pr_info("error %s: fail to register cache device\n",
2535			args->path);
2536	kfree(args->sb);
2537	kfree(args->path);
2538	kfree(args);
2539	module_put(THIS_MODULE);
2540}
2541
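/*
 * Queue the registration worker that matches the superblock type
 * (backing device vs. cache device) with a small delay on system_wq.
 */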
2542static void register_device_async(struct async_reg_args *args)
2543{
2544	if (SB_IS_BDEV(args->sb))
2545		INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
2546	else
2547		INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
2548
2549	/* 10 jiffies is enough for a delay */
2550	queue_delayed_work(system_wq, &args->reg_work, 10);
2551}
2552
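/*
 * Handler for writes to /sys/fs/bcache/register (and register_quiet):
 * open the block device whose path is in @buffer, read its bcache
 * superblock and register it as a backing or cache device, either
 * synchronously or through the async workers above when
 * CONFIG_BCACHE_ASYNC_REGISTRATION is enabled. Typical usage:
 *
 *	echo /dev/sdX > /sys/fs/bcache/register
 */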
2553static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2554			       const char *buffer, size_t size)
2555{
2556	const char *err;
2557	char *path = NULL;
2558	struct cache_sb *sb;
2559	struct cache_sb_disk *sb_disk;
2560	struct block_device *bdev;
2561	ssize_t ret;
2562	bool async_registration = false;
2563
2564#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
2565	async_registration = true;
2566#endif
2567
2568	ret = -EBUSY;
2569	err = "failed to reference bcache module";
2570	if (!try_module_get(THIS_MODULE))
2571		goto out;
2572
2573	/* Make sure we see the latest state of bcache_is_reboot */
2574	smp_mb();
2575	err = "bcache is in reboot";
2576	if (bcache_is_reboot)
2577		goto out_module_put;
2578
2579	ret = -ENOMEM;
2580	err = "cannot allocate memory";
2581	path = kstrndup(buffer, size, GFP_KERNEL);
2582	if (!path)
2583		goto out_module_put;
2584
2585	sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2586	if (!sb)
2587		goto out_free_path;
2588
2589	ret = -EINVAL;
2590	err = "failed to open device";
2591	bdev = blkdev_get_by_path(strim(path),
2592				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2593				  sb);
2594	if (IS_ERR(bdev)) {
2595		if (bdev == ERR_PTR(-EBUSY)) {
2596			bdev = lookup_bdev(strim(path));
2597			mutex_lock(&bch_register_lock);
2598			if (!IS_ERR(bdev) && bch_is_open(bdev))
2599				err = "device already registered";
2600			else
2601				err = "device busy";
2602			mutex_unlock(&bch_register_lock);
2603			if (!IS_ERR(bdev))
2604				bdput(bdev);
2605			if (attr == &ksysfs_register_quiet)
2606				goto done;
2607		}
2608		goto out_free_sb;
2609	}
2610
2611	err = "failed to set blocksize";
2612	if (set_blocksize(bdev, 4096))
2613		goto out_blkdev_put;
2614
2615	err = read_super(sb, bdev, &sb_disk);
2616	if (err)
2617		goto out_blkdev_put;
2618
2619	err = "failed to register device";
2620
2621	if (async_registration) {
2622		/* register asynchronously */
2623		struct async_reg_args *args =
2624			kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
2625
2626		if (!args) {
2627			ret = -ENOMEM;
2628			err = "cannot allocate memory";
2629			goto out_put_sb_page;
2630		}
2631
2632		args->path	= path;
2633		args->sb	= sb;
2634		args->sb_disk	= sb_disk;
2635		args->bdev	= bdev;
2636		register_device_async(args);
2637		/* Don't wait; return to user space right away */
2638		goto async_done;
2639	}
2640
2641	if (SB_IS_BDEV(sb)) {
2642		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2643
2644		if (!dc)
2645			goto out_put_sb_page;
2646
2647		mutex_lock(&bch_register_lock);
2648		ret = register_bdev(sb, sb_disk, bdev, dc);
2649		mutex_unlock(&bch_register_lock);
2650		/* blkdev_put() will be called in cached_dev_free() */
2651		if (ret < 0)
2652			goto out_free_sb;
2653	} else {
2654		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2655
2656		if (!ca)
2657			goto out_put_sb_page;
2658
2659		/* blkdev_put() will be called in bch_cache_release() */
2660		if (register_cache(sb, sb_disk, bdev, ca) != 0)
2661			goto out_free_sb;
2662	}
2663
2664done:
2665	kfree(sb);
2666	kfree(path);
2667	module_put(THIS_MODULE);
2668async_done:
2669	return size;
2670
2671out_put_sb_page:
2672	put_page(virt_to_page(sb_disk));
2673out_blkdev_put:
2674	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2675out_free_sb:
2676	kfree(sb);
2677out_free_path:
2678	kfree(path);
2679	path = NULL;
2680out_module_put:
2681	module_put(THIS_MODULE);
2682out:
2683	pr_info("error %s: %s\n", path?path:"", err);
2684	return ret;
2685}
2686
2687
2688struct pdev {
2689	struct list_head list;
2690	struct cached_dev *dc;
2691};
2692
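/*
 * Handler for writes to /sys/fs/bcache/pendings_cleanup: stop backing
 * devices that are registered but still waiting for a cache set whose
 * set uuid has not shown up.
 */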
2693static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2694					 struct kobj_attribute *attr,
2695					 const char *buffer,
2696					 size_t size)
2697{
2698	LIST_HEAD(pending_devs);
2699	ssize_t ret = size;
2700	struct cached_dev *dc, *tdc;
2701	struct pdev *pdev, *tpdev;
2702	struct cache_set *c, *tc;
2703
2704	mutex_lock(&bch_register_lock);
2705	list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
2706		pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
2707		if (!pdev)
2708			break;
2709		pdev->dc = dc;
2710		list_add(&pdev->list, &pending_devs);
2711	}
2712
2713	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2714		list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2715			char *pdev_set_uuid = pdev->dc->sb.set_uuid;
2716			char *set_uuid = c->set_uuid;
2717
2718			if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
2719				list_del(&pdev->list);
2720				kfree(pdev);
2721				break;
2722			}
2723		}
2724	}
2725	mutex_unlock(&bch_register_lock);
2726
2727	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2728		pr_info("delete pdev %p\n", pdev);
2729		list_del(&pdev->list);
2730		bcache_device_stop(&pdev->dc->disk);
2731		kfree(pdev);
2732	}
2733
2734	return ret;
2735}
2736
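/*
 * Reboot notifier: on shutdown, halt or power off, reject further
 * registrations, stop every cache set and backing device, and wait up
 * to 10 seconds for them to finish closing.
 */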
2737static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2738{
2739	if (bcache_is_reboot)
2740		return NOTIFY_DONE;
2741
2742	if (code == SYS_DOWN ||
2743	    code == SYS_HALT ||
2744	    code == SYS_POWER_OFF) {
2745		DEFINE_WAIT(wait);
2746		unsigned long start = jiffies;
2747		bool stopped = false;
2748
2749		struct cache_set *c, *tc;
2750		struct cached_dev *dc, *tdc;
2751
2752		mutex_lock(&bch_register_lock);
2753
2754		if (bcache_is_reboot)
2755			goto out;
2756
2757		/* New registrations are rejected from now on */
2758		bcache_is_reboot = true;
2759		/*
2760		 * Make sure a registering caller (if any) on another CPU
2761		 * core sees bcache_is_reboot set to true before going on
2762		 */
2763		smp_mb();
2764
2765		if (list_empty(&bch_cache_sets) &&
2766		    list_empty(&uncached_devices))
2767			goto out;
2768
2769		mutex_unlock(&bch_register_lock);
2770
2771		pr_info("Stopping all devices:\n");
2772
2773		/*
2774		 * bch_register_lock is not held while calling
2775		 * bch_cache_set_stop() and bcache_device_stop(), to
2776		 * avoid a potential deadlock during reboot, because the
2777		 * cache set and bcache device stopping paths acquire
2778		 * bch_register_lock too.
2779		 *
2780		 * We are safe here because bcache_is_reboot is already
2781		 * set to true, so register_bcache() will reject new
2782		 * registrations. bcache_is_reboot also makes sure
2783		 * bcache_reboot() won't be re-entered by another thread,
2784		 * so there is no race in the following list iteration
2785		 * with list_for_each_entry_safe().
2786		 */
2787		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2788			bch_cache_set_stop(c);
2789
2790		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2791			bcache_device_stop(&dc->disk);
2792
2793
2794		/*
2795		 * Give an early chance for other kthreads and
2796		 * kworkers to stop themselves
2797		 */
2798		schedule();
2799
2800		/* What's a condition variable? */
2801		while (1) {
2802			long timeout = start + 10 * HZ - jiffies;
2803
2804			mutex_lock(&bch_register_lock);
2805			stopped = list_empty(&bch_cache_sets) &&
2806				list_empty(&uncached_devices);
2807
2808			if (timeout < 0 || stopped)
2809				break;
2810
2811			prepare_to_wait(&unregister_wait, &wait,
2812					TASK_UNINTERRUPTIBLE);
2813
2814			mutex_unlock(&bch_register_lock);
2815			schedule_timeout(timeout);
2816		}
2817
2818		finish_wait(&unregister_wait, &wait);
2819
2820		if (stopped)
2821			pr_info("All devices stopped\n");
2822		else
2823			pr_notice("Timeout waiting for devices to be closed\n");
2824out:
2825		mutex_unlock(&bch_register_lock);
2826	}
2827
2828	return NOTIFY_DONE;
2829}
2830
2831static struct notifier_block reboot = {
2832	.notifier_call	= bcache_reboot,
2833	.priority	= INT_MAX, /* before any real devices */
2834};
2835
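/* Undo everything set up in bcache_init(); also used as its error path. */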
2836static void bcache_exit(void)
2837{
2838	bch_debug_exit();
2839	bch_request_exit();
2840	if (bcache_kobj)
2841		kobject_put(bcache_kobj);
2842	if (bcache_wq)
2843		destroy_workqueue(bcache_wq);
2844	if (bch_journal_wq)
2845		destroy_workqueue(bch_journal_wq);
2846	if (bch_flush_wq)
2847		destroy_workqueue(bch_flush_wq);
2848	bch_btree_exit();
2849
2850	if (bcache_major)
2851		unregister_blkdev(bcache_major, "bcache");
2852	unregister_reboot_notifier(&reboot);
2853	mutex_destroy(&bch_register_lock);
2854}
2855
2856/* Check and fixup module parameters */
2857static void check_module_parameters(void)
2858{
2859	if (bch_cutoff_writeback_sync == 0)
2860		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2861	else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2862		pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
2863			bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2864		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2865	}
2866
2867	if (bch_cutoff_writeback == 0)
2868		bch_cutoff_writeback = CUTOFF_WRITEBACK;
2869	else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2870		pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
2871			bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2872		bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2873	}
2874
2875	if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2876		pr_warn("set bch_cutoff_writeback (%u) to %u\n",
2877			bch_cutoff_writeback, bch_cutoff_writeback_sync);
2878		bch_cutoff_writeback = bch_cutoff_writeback_sync;
2879	}
2880}
2881
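/*
 * Module init: check module parameters, register the reboot notifier
 * and block major, create the workqueues and the /sys/fs/bcache sysfs
 * interface. On failure bcache_exit() tears down whatever was set up.
 */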
2882static int __init bcache_init(void)
2883{
2884	static const struct attribute *files[] = {
2885		&ksysfs_register.attr,
2886		&ksysfs_register_quiet.attr,
2887		&ksysfs_pendings_cleanup.attr,
2888		NULL
2889	};
2890
2891	check_module_parameters();
2892
2893	mutex_init(&bch_register_lock);
2894	init_waitqueue_head(&unregister_wait);
2895	register_reboot_notifier(&reboot);
2896
2897	bcache_major = register_blkdev(0, "bcache");
2898	if (bcache_major < 0) {
2899		unregister_reboot_notifier(&reboot);
2900		mutex_destroy(&bch_register_lock);
2901		return bcache_major;
2902	}
2903
2904	if (bch_btree_init())
2905		goto err;
2906
2907	bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2908	if (!bcache_wq)
2909		goto err;
2910
2911	/*
2912	 * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
2913	 *
2914	 * 1. It used `system_wq` before which also does no memory reclaim.
2915	 * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
2916	 *    reduced throughput can be observed.
2917	 *
2918	 * We still want to use our own queue to avoid congesting the `system_wq`.
2919	 */
2920	bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
2921	if (!bch_flush_wq)
2922		goto err;
2923
2924	bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2925	if (!bch_journal_wq)
2926		goto err;
2927
2928	bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2929	if (!bcache_kobj)
2930		goto err;
2931
2932	if (bch_request_init() ||
2933	    sysfs_create_files(bcache_kobj, files))
2934		goto err;
2935
2936	bch_debug_init();
2937	closure_debug_init();
2938
2939	bcache_is_reboot = false;
2940
2941	return 0;
2942err:
2943	bcache_exit();
2944	return -ENOMEM;
2945}
2946
2947/*
2948 * Module hooks
2949 */
2950module_exit(bcache_exit);
2951module_init(bcache_init);
2952
2953module_param(bch_cutoff_writeback, uint, 0);
2954MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2955
2956module_param(bch_cutoff_writeback_sync, uint, 0);
2957MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2958
2959MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2960MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2961MODULE_LICENSE("GPL");
2962