/*
 * Copyright (C) 2011-2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "persistent-data/dm-btree.h"
#include "persistent-data/dm-space-map.h"
#include "persistent-data/dm-space-map-disk.h"
#include "persistent-data/dm-transaction-manager.h"

#include <linux/list.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/*--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels, which effectively maps (thin
 *   dev id, virtual block) -> block_time.  block_time is a 64-bit
 *   field holding the time in the low 24 bits, and the block in the
 *   top 40 bits.
 *
 * BTrees consist solely of btree_nodes, each of which fills a block.
 * Some are internal nodes, so their values are a __le64 pointing to
 * other nodes.  Leaf nodes can store data of any reasonable size
 * (i.e. much smaller than the block size).  A node consists of a
 * header, followed by an array of keys, followed by an array of
 * values.  We have to binary search on the keys, so they're all held
 * together to help the cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block and holds some details, such as how many free entries
 *   there are.
 *
 * - The bitmap blocks have a header (for the checksum).  The rest of
 *   the block is pairs of bits, with the following meaning:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has one single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size, such as 64k, the
 * metadata can support data devices that are hundreds of terabytes in
 * size.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try and avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------*/

#define DM_MSG_PREFIX   "thin metadata"

#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
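/*
 * Shift to convert a size in 512-byte sectors into metadata blocks
 * (1 << 3 == 8 sectors, i.e. one 4KiB metadata block).
 */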
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For btree insert:
 *  3 for btree insert +
 *  2 for btree lookup used within space map
 * For btree remove:
 *  2 for shadow spine +
 *  4 for rebalancing 3 child nodes
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128

/*
 * Little endian on-disk superblock and device details.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;
	__le32 version;
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;

struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;

struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	uint32_t time;
	dm_block_t root;
	dm_block_t details_root;
	struct list_head thin_devices;
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read them into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};

struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;
	bool changed:1;
	bool aborted_with_changes:1;
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};

/*----------------------------------------------------------------
 * superblock validator
 *--------------------------------------------------------------*/

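/*
 * Salt for the superblock checksum, so that a superblock cannot be
 * mistaken for another type of metadata block.
 */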
#define SUPERBLOCK_CSUM_XOR 160774

static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);

	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
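	/* Checksum everything after the csum field itself. */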
	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
						      block_size - sizeof(__le32),
						      SUPERBLOCK_CSUM_XOR));
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: wanted %llu",
		      le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: wanted %llu",
		      le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("sb_check failed: csum %u: wanted %u",
		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*----------------------------------------------------------------
 * Methods for the btree value types
 *--------------------------------------------------------------*/

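/*
 * block_time packs a 40-bit data block number and a 24-bit timestamp
 * into a single 64-bit value.  For example, block 1000 at time 3 packs
 * to (1000 << 24) | 3 == 0x3e8000003.  Note pack_block_time() does no
 * masking; callers must keep b within 40 bits and t within 24 bits.
 */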
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1 << 24) - 1);
}

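/*
 * The value pointers handed to these callbacks reference little-endian
 * data packed inside a btree node, which may not be naturally aligned;
 * hence each callback memcpy()s the value into a local before using it.
 */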
static void data_block_inc(void *context, const void *value_le)
{
	struct dm_space_map *sm = context;
	__le64 v_le;
	uint64_t b;
	uint32_t t;

	memcpy(&v_le, value_le, sizeof(v_le));
	unpack_block_time(le64_to_cpu(v_le), &b, &t);
	dm_sm_inc_block(sm, b);
}

static void data_block_dec(void *context, const void *value_le)
{
	struct dm_space_map *sm = context;
	__le64 v_le;
	uint64_t b;
	uint32_t t;

	memcpy(&v_le, value_le, sizeof(v_le));
	unpack_block_time(le64_to_cpu(v_le), &b, &t);
	dm_sm_dec_block(sm, b);
}

static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	uint64_t b1, b2;
	uint32_t t;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);

	return b1 == b2;
}

static void subtree_inc(void *context, const void *value)
{
	struct dm_btree_info *info = context;
	__le64 root_le;
	uint64_t root;

	memcpy(&root_le, value, sizeof(root_le));
	root = le64_to_cpu(root_le);
	dm_tm_inc(info->tm, root);
}

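/*
 * Dropping a top-level entry removes an entire bottom-level mapping
 * tree: dm_btree_del() walks the subtree, decrementing reference
 * counts, and calls data_block_dec() on any leaf values it frees.
 */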
static void subtree_dec(void *context, const void *value)
{
	struct dm_btree_info *info = context;
	__le64 root_le;
	uint64_t root;

	memcpy(&root_le, value, sizeof(root_le));
	root = le64_to_cpu(root_le);
	if (dm_btree_del(info, root))
		DMERR("btree delete failed");
}

static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));

	return v1_le == v2_le;
}

/*----------------------------------------------------------------*/

/*
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}

static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}

/*----------------------------------------------------------------*/

static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

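/*
 * A freshly created pool device presents an all-zeroes superblock;
 * this is how __open_or_format_metadata() decides between formatting
 * and opening.
 */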
static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
{
	int r;
	unsigned i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = 1;
	for (i = 0; i < block_size; i++) {
		if (data_le[i] != zero) {
			*result = 0;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

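/*
 * Five btree_info variants share the same underlying trees: info and
 * nb_info cover both levels of the mapping tree (nb_info via the
 * non-blocking transaction manager clone), tl_info and bl_info cover
 * one level each, and details_info describes the device details tree.
 */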
static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}

static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}

static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}

static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;

	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	return dm_tm_commit(pmd->tm, sblock);
}

static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_create_with_sm failed");
		return r;
	}

	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);

	return r;
}

static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (get_disk_ro(pmd->bdev->bd_disk))
		return 0;

	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}

static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	/*
	 * When opening the pool metadata, setting the roots here is
	 * redundant because __begin_transaction() will set them again.
	 * But the pool abort path really does need the last
	 * transaction's roots here, so that it doesn't access a broken
	 * btree.
	 */
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}

static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int r, unformatted;

	r = __superblock_all_zeroes(pmd->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return format_device ? __format_metadata(pmd) : -EPERM;

	return __open_metadata(pmd);
}

static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		DMERR("could not create block manager");
		r = PTR_ERR(pmd->bm);
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (r) {
		dm_block_manager_destroy(pmd->bm);
		pmd->bm = NULL;
	}

	return r;
}

static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
					      bool destroy_bm)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	if (destroy_bm)
		dm_block_manager_destroy(pmd->bm);
}

static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time.  Shouldn't really need
	 * to do this.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}

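/*
 * Flush the in-core details of every changed thin device out to the
 * details btree.  Devices that have also been closed are freed once
 * their details are written.
 */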
static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}

static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * The superblock must fit in a single 512-byte sector so that
	 * it can be written atomically.
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096;	/* 16M with 4K metadata blocks */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else
		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	init_rwsem(&pmd->root_lock);
	pmd->time = 0;
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		      open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	if (!pmd->fail_io)
		__destroy_persistent_data_objects(pmd, true);

	kfree(pmd);
	return 0;
}

/*
 * __open_device: Returns @td corresponding to the device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}

static void __close_device(struct dm_thin_device *td)
{
	--td->open_count;
}

static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}

int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *td;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	td->changed = true;
	td->snapshotted_time = time;

	snap->mapped_blocks = td->mapped_blocks;
	snap->snapshotted_time = time;
	__close_device(td);

	return 0;
}

static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}

int dm_pool_create_snap(struct dm_pool_metadata *pmd,
			dm_thin_id dev,
			dm_thin_id origin)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}

static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}

int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (pmd->fail_io)
		goto out;

	if (pmd->trans_id != current_id) {
		DMERR("mismatched transaction id");
		goto out;
	}

	pmd->trans_id = new_id;
	r = 0;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = pmd->trans_id;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the space map roots since we're not publishing this.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}

int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __reserve_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}

int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __release_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);

	dm_bm_unlock(sblock);

	return 0;
}

int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		r = __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_close_thin_device(struct dm_thin_device *td)
{
	pmd_write_lock_in_core(td->pmd);
	__close_device(td);
	pmd_write_unlock(td->pmd);

	return 0;
}

dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}

static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	uint64_t block_time = 0;
	dm_block_t exception_block;
	uint32_t exception_time;

	block_time = le64_to_cpu(value);
	unpack_block_time(block_time, &exception_block, &exception_time);
	result->block = exception_block;
	result->shared = __snapshotted_since(td, exception_time);
}

static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_issue_io)
		info = &pmd->info;
	else
		info = &pmd->nb_info;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (pmd->fail_io) {
		up_read(&pmd->root_lock);
		return -EINVAL;
	}

	r = __find_block(td, block, can_issue_io, result);

	up_read(&pmd->root_lock);
	return r;
}

static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
				    dm_block_t *vblock,
				    struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

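/*
 * Find the first run of mapped blocks within [begin, end) that is
 * contiguous on the data device and uniformly shared or unshared.
 * Returns -ENODATA if nothing in the range is mapped.
 */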
static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;
			else
				return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}

int dm_thin_find_mapped_range(struct dm_thin_device *td,
			      dm_block_t begin, dm_block_t end,
			      dm_block_t *thin_begin, dm_block_t *thin_end,
			      dm_block_t *pool_begin, bool *maybe_shared)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
					pool_begin, maybe_shared);
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r, inserted;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&value);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
				   &pmd->root, &inserted);
	if (r)
		return r;

	td->changed = true;
	if (inserted)
		td->mapped_blocks++;

	return 0;
}

int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __insert(td, block, data_block);
	pmd_write_unlock(td->pmd);

	return r;
}

static int __remove(struct dm_thin_device *td, dm_block_t block)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	td->mapped_blocks--;
	td->changed = true;

	return 0;
}

static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree.
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove from the mapping tree, taking care to inc the
	 * ref count so it doesn't get deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * dm_btree_remove_leaves() stops at the first unmapped entry,
	 * so we have to loop round finding mapped ranges.
	 */
	while (begin < end) {
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}

int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove(td, block);
	pmd_write_unlock(td->pmd);

	return r;
}

int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove_range(td, begin, end);
	pmd_write_unlock(td->pmd);

	return r;
}

int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
	if (!r)
		*result = (ref_count > 1);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	for (; b != e; b++) {
		r = dm_sm_inc_block(pmd->data_sm, b);
		if (r)
			break;
	}
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	for (; b != e; b++) {
		r = dm_sm_dec_block(pmd->data_sm, b);
		if (r)
			break;
	}
	pmd_write_unlock(pmd);

	return r;
}

bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->changed) {
			r = td->changed;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to not have commit be what
	 * triggers putting the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block_manager *old_bm = NULL, *new_bm = NULL;

	/* fail_io is double-checked with pmd->root_lock held below */
	if (unlikely(pmd->fail_io))
		return r;

	/*
	 * The replacement block manager (new_bm) is created, and old_bm
	 * destroyed, outside of pmd->root_lock to avoid the ABBA deadlock
	 * that would otherwise result (due to the life-cycle of the
	 * shrinker associated with the block manager's bufio client vs
	 * pmd->root_lock): shrinker_rwsem must be taken without holding
	 * pmd->root_lock.
	 */
	new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					 THIN_MAX_CONCURRENT_LOCKS);

	pmd_write_lock(pmd);
	if (pmd->fail_io) {
		pmd_write_unlock(pmd);
		goto out;
	}

	__set_abort_with_changes_flags(pmd);
	__destroy_persistent_data_objects(pmd, false);
	old_bm = pmd->bm;
	if (IS_ERR(new_bm)) {
		DMERR("could not create block manager during abort");
		pmd->bm = NULL;
		r = PTR_ERR(new_bm);
		goto out_unlock;
	}

	pmd->bm = new_bm;
	r = __open_or_format_metadata(pmd, false);
	if (r) {
		pmd->bm = NULL;
		goto out_unlock;
	}
	new_bm = NULL;
out_unlock:
	if (r)
		pmd->fail_io = true;
	pmd_write_unlock(pmd);
	dm_block_manager_destroy(old_bm);
out:
	if (new_bm && !IS_ERR(new_bm))
		dm_block_manager_destroy(new_bm);

	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}

static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io) {
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}