xref: /kernel/linux/linux-5.10/drivers/md/dm-table.c (revision 8c2ecf20)
1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-core.h"
9
10#include <linux/module.h>
11#include <linux/vmalloc.h>
12#include <linux/blkdev.h>
13#include <linux/namei.h>
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/slab.h>
17#include <linux/interrupt.h>
18#include <linux/mutex.h>
19#include <linux/delay.h>
20#include <linux/atomic.h>
21#include <linux/blk-mq.h>
22#include <linux/mount.h>
23#include <linux/dax.h>
24
25#define DM_MSG_PREFIX "table"
26
27#define NODE_SIZE L1_CACHE_BYTES
28#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
29#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
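
/*
 * Example (assuming a common x86_64 configuration, not guaranteed for every
 * build): with L1_CACHE_BYTES == 64 and an 8-byte sector_t, each btree node
 * holds KEYS_PER_NODE == 8 keys and fans out to CHILDREN_PER_NODE == 9
 * children.
 */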
30
31/*
32 * Similar to ceiling(log_base(n))
33 */
34static unsigned int int_log(unsigned int n, unsigned int base)
35{
36	int result = 0;
37
38	while (n > 1) {
39		n = dm_div_up(n, base);
40		result++;
41	}
42
43	return result;
44}
45
46/*
47 * Calculate the index of the child node for the k'th key of the n'th node.
48 */
49static inline unsigned int get_child(unsigned int n, unsigned int k)
50{
51	return (n * CHILDREN_PER_NODE) + k;
52}
53
54/*
55 * Return the n'th node of level l from table t.
56 */
57static inline sector_t *get_node(struct dm_table *t,
58				 unsigned int l, unsigned int n)
59{
60	return t->index[l] + (n * KEYS_PER_NODE);
61}
62
63/*
64 * Return the highest key that you could look up from the n'th
65 * node on level l of the btree.
66 */
67static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
68{
69	for (; l < t->depth - 1; l++)
70		n = get_child(n, CHILDREN_PER_NODE - 1);
71
72	if (n >= t->counts[l])
73		return (sector_t) - 1;
74
75	return get_node(t, l, n)[KEYS_PER_NODE - 1];
76}
77
78/*
79 * Fills in a level of the btree based on the highs of the level
80 * below it.
81 */
82static int setup_btree_index(unsigned int l, struct dm_table *t)
83{
84	unsigned int n, k;
85	sector_t *node;
86
87	for (n = 0U; n < t->counts[l]; n++) {
88		node = get_node(t, l, n);
89
90		for (k = 0U; k < KEYS_PER_NODE; k++)
91			node[k] = high(t, l + 1, get_child(n, k));
92	}
93
94	return 0;
95}
96
97void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
98{
99	unsigned long size;
100	void *addr;
101
102	/*
103	 * Check that we're not going to overflow.
104	 */
105	if (nmemb > (ULONG_MAX / elem_size))
106		return NULL;
107
108	size = nmemb * elem_size;
109	addr = vzalloc(size);
110
111	return addr;
112}
113EXPORT_SYMBOL(dm_vcalloc);
114
115/*
116 * highs and targets are managed as dynamic arrays during a
117 * table load.
118 */
119static int alloc_targets(struct dm_table *t, unsigned int num)
120{
121	sector_t *n_highs;
122	struct dm_target *n_targets;
123
124	/*
125	 * Allocate both the target array and offset array at once.
126	 */
127	n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
128					  sizeof(sector_t));
129	if (!n_highs)
130		return -ENOMEM;
131
132	n_targets = (struct dm_target *) (n_highs + num);
133
134	memset(n_highs, -1, sizeof(*n_highs) * num);
135	vfree(t->highs);
136
137	t->num_allocated = num;
138	t->highs = n_highs;
139	t->targets = n_targets;
140
141	return 0;
142}
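
/*
 * Layout of the single allocation made by alloc_targets() (one contiguous
 * vzalloc'd block):
 *
 *	n_highs[0] ... n_highs[num - 1] | n_targets[0] ... n_targets[num - 1]
 *
 * t->highs points at the start of the block and t->targets immediately
 * after the last sector_t entry.
 */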
143
144int dm_table_create(struct dm_table **result, fmode_t mode,
145		    unsigned num_targets, struct mapped_device *md)
146{
147	struct dm_table *t;
148
149	if (num_targets > DM_MAX_TARGETS)
150		return -EOVERFLOW;
151
152	t = kzalloc(sizeof(*t), GFP_KERNEL);
153
154	if (!t)
155		return -ENOMEM;
156
157	INIT_LIST_HEAD(&t->devices);
158
159	if (!num_targets)
160		num_targets = KEYS_PER_NODE;
161
162	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
163
164	if (!num_targets) {
165		kfree(t);
166		return -EOVERFLOW;
167	}
168
169	if (alloc_targets(t, num_targets)) {
170		kfree(t);
171		return -ENOMEM;
172	}
173
174	t->type = DM_TYPE_NONE;
175	t->mode = mode;
176	t->md = md;
177	*result = t;
178	return 0;
179}
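
/*
 * Example: a request for a single target is rounded up to one full leaf
 * node, i.e. dm_round_up(1, KEYS_PER_NODE) == KEYS_PER_NODE entries
 * (8 with the node geometry assumed above), so the highs/targets arrays
 * always cover whole leaf nodes.
 */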
180
181static void free_devices(struct list_head *devices, struct mapped_device *md)
182{
183	struct list_head *tmp, *next;
184
185	list_for_each_safe(tmp, next, devices) {
186		struct dm_dev_internal *dd =
187		    list_entry(tmp, struct dm_dev_internal, list);
188		DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
189		       dm_device_name(md), dd->dm_dev->name);
190		dm_put_table_device(md, dd->dm_dev);
191		kfree(dd);
192	}
193}
194
195void dm_table_destroy(struct dm_table *t)
196{
197	unsigned int i;
198
199	if (!t)
200		return;
201
202	/* free the indexes */
203	if (t->depth >= 2)
204		vfree(t->index[t->depth - 2]);
205
206	/* free the targets */
207	for (i = 0; i < t->num_targets; i++) {
208		struct dm_target *tgt = t->targets + i;
209
210		if (tgt->type->dtr)
211			tgt->type->dtr(tgt);
212
213		dm_put_target_type(tgt->type);
214	}
215
216	vfree(t->highs);
217
218	/* free the device list */
219	free_devices(&t->devices, t->md);
220
221	dm_free_md_mempools(t->mempools);
222
223	kfree(t);
224}
225
226/*
227 * See if we've already got a device in the list.
228 */
229static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
230{
231	struct dm_dev_internal *dd;
232
233	list_for_each_entry (dd, l, list)
234		if (dd->dm_dev->bdev->bd_dev == dev)
235			return dd;
236
237	return NULL;
238}
239
240/*
241 * If possible, this checks whether an area of the destination device is invalid.
242 */
243static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
244				  sector_t start, sector_t len, void *data)
245{
246	struct queue_limits *limits = data;
247	struct block_device *bdev = dev->bdev;
248	sector_t dev_size =
249		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
250	unsigned short logical_block_size_sectors =
251		limits->logical_block_size >> SECTOR_SHIFT;
252	char b[BDEVNAME_SIZE];
253
254	if (!dev_size)
255		return 0;
256
257	if ((start >= dev_size) || (start + len > dev_size)) {
258		DMWARN("%s: %s too small for target: "
259		       "start=%llu, len=%llu, dev_size=%llu",
260		       dm_device_name(ti->table->md), bdevname(bdev, b),
261		       (unsigned long long)start,
262		       (unsigned long long)len,
263		       (unsigned long long)dev_size);
264		return 1;
265	}
266
267	/*
268	 * If the target is mapped to zoned block device(s), check
269	 * that the zones are not partially mapped.
270	 */
271	if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
272		unsigned int zone_sectors = bdev_zone_sectors(bdev);
273
274		if (start & (zone_sectors - 1)) {
275			DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
276			       dm_device_name(ti->table->md),
277			       (unsigned long long)start,
278			       zone_sectors, bdevname(bdev, b));
279			return 1;
280		}
281
282		/*
283		 * Note: The last zone of a zoned block device may be smaller
284		 * than other zones. So for a target mapping the end of a
285		 * zoned block device with such a zone, len would not be zone
286		 * aligned. We do not allow such last smaller zone to be part
287		 * of the mapping here to ensure that mappings with multiple
288		 * devices do not end up with a smaller zone in the middle of
289		 * the sector range.
290		 */
291		if (len & (zone_sectors - 1)) {
292			DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s",
293			       dm_device_name(ti->table->md),
294			       (unsigned long long)len,
295			       zone_sectors, bdevname(bdev, b));
296			return 1;
297		}
298	}
299
300	if (logical_block_size_sectors <= 1)
301		return 0;
302
303	if (start & (logical_block_size_sectors - 1)) {
304		DMWARN("%s: start=%llu not aligned to h/w "
305		       "logical block size %u of %s",
306		       dm_device_name(ti->table->md),
307		       (unsigned long long)start,
308		       limits->logical_block_size, bdevname(bdev, b));
309		return 1;
310	}
311
312	if (len & (logical_block_size_sectors - 1)) {
313		DMWARN("%s: len=%llu not aligned to h/w "
314		       "logical block size %u of %s",
315		       dm_device_name(ti->table->md),
316		       (unsigned long long)len,
317		       limits->logical_block_size, bdevname(bdev, b));
318		return 1;
319	}
320
321	return 0;
322}
323
324/*
325 * This upgrades the mode on an already open dm_dev, being
326 * careful to leave things as they were if we fail to reopen the
327 * device and not to touch the existing bdev field in case
328 * it is accessed concurrently.
329 */
330static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
331			struct mapped_device *md)
332{
333	int r;
334	struct dm_dev *old_dev, *new_dev;
335
336	old_dev = dd->dm_dev;
337
338	r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
339				dd->dm_dev->mode | new_mode, &new_dev);
340	if (r)
341		return r;
342
343	dd->dm_dev = new_dev;
344	dm_put_table_device(md, old_dev);
345
346	return 0;
347}
348
349/*
350 * Convert the path to a device
351 */
352dev_t dm_get_dev_t(const char *path)
353{
354	dev_t dev;
355	struct block_device *bdev;
356
357	bdev = lookup_bdev(path);
358	if (IS_ERR(bdev))
359		dev = name_to_dev_t(path);
360	else {
361		dev = bdev->bd_dev;
362		bdput(bdev);
363	}
364
365	return dev;
366}
367EXPORT_SYMBOL_GPL(dm_get_dev_t);
368
369/*
370 * Add a device to the list, or just increment the usage count if
371 * it's already present.
372 */
373int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
374		  struct dm_dev **result)
375{
376	int r;
377	dev_t dev;
378	unsigned int major, minor;
379	char dummy;
380	struct dm_dev_internal *dd;
381	struct dm_table *t = ti->table;
382
383	BUG_ON(!t);
384
385	if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
386		/* Extract the major/minor numbers */
387		dev = MKDEV(major, minor);
388		if (MAJOR(dev) != major || MINOR(dev) != minor)
389			return -EOVERFLOW;
390	} else {
391		dev = dm_get_dev_t(path);
392		if (!dev)
393			return -ENODEV;
394	}
395
396	dd = find_device(&t->devices, dev);
397	if (!dd) {
398		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
399		if (!dd)
400			return -ENOMEM;
401
402		if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
403			kfree(dd);
404			return r;
405		}
406
407		refcount_set(&dd->count, 1);
408		list_add(&dd->list, &t->devices);
409		goto out;
410
411	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
412		r = upgrade_mode(dd, mode, t->md);
413		if (r)
414			return r;
415	}
416	refcount_inc(&dd->count);
417out:
418	*result = dd->dm_dev;
419	return 0;
420}
421EXPORT_SYMBOL(dm_get_device);
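
/*
 * A minimal sketch (names are illustrative) of how a target constructor
 * typically uses dm_get_device()/dm_put_device(); dm-linear follows this
 * pattern:
 *
 *	struct my_ctx { struct dm_dev *dev; };
 *
 *	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
 *			  &ctx->dev)) {
 *		ti->error = "Device lookup failed";
 *		return -EINVAL;
 *	}
 *	...
 *	// and in the destructor:
 *	dm_put_device(ti, ctx->dev);
 */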
422
423static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
424				sector_t start, sector_t len, void *data)
425{
426	struct queue_limits *limits = data;
427	struct block_device *bdev = dev->bdev;
428	struct request_queue *q = bdev_get_queue(bdev);
429	char b[BDEVNAME_SIZE];
430
431	if (unlikely(!q)) {
432		DMWARN("%s: Cannot set limits for nonexistent device %s",
433		       dm_device_name(ti->table->md), bdevname(bdev, b));
434		return 0;
435	}
436
437	if (blk_stack_limits(limits, &q->limits,
438			get_start_sect(bdev) + start) < 0)
439		DMWARN("%s: adding target device %s caused an alignment inconsistency: "
440		       "physical_block_size=%u, logical_block_size=%u, "
441		       "alignment_offset=%u, start=%llu",
442		       dm_device_name(ti->table->md), bdevname(bdev, b),
443		       q->limits.physical_block_size,
444		       q->limits.logical_block_size,
445		       q->limits.alignment_offset,
446		       (unsigned long long) start << SECTOR_SHIFT);
447	return 0;
448}
449
450/*
451 * Decrement a device's use count and remove it if necessary.
452 */
453void dm_put_device(struct dm_target *ti, struct dm_dev *d)
454{
455	int found = 0;
456	struct list_head *devices = &ti->table->devices;
457	struct dm_dev_internal *dd;
458
459	list_for_each_entry(dd, devices, list) {
460		if (dd->dm_dev == d) {
461			found = 1;
462			break;
463		}
464	}
465	if (!found) {
466		DMWARN("%s: device %s not in table devices list",
467		       dm_device_name(ti->table->md), d->name);
468		return;
469	}
470	if (refcount_dec_and_test(&dd->count)) {
471		dm_put_table_device(ti->table->md, d);
472		list_del(&dd->list);
473		kfree(dd);
474	}
475}
476EXPORT_SYMBOL(dm_put_device);
477
478/*
479 * Checks to see if the target joins onto the end of the table.
480 */
481static int adjoin(struct dm_table *table, struct dm_target *ti)
482{
483	struct dm_target *prev;
484
485	if (!table->num_targets)
486		return !ti->begin;
487
488	prev = &table->targets[table->num_targets - 1];
489	return (ti->begin == (prev->begin + prev->len));
490}
491
492/*
493 * Used to dynamically allocate the arg array.
494 *
495 * We do the first allocation with GFP_NOIO because dm-mpath and dm-thin must
496 * process messages even if some device is suspended. These messages have a
497 * small fixed number of arguments.
498 *
499 * On the other hand, dm-switch needs to process bulk data using messages and
500 * excessive use of GFP_NOIO could cause trouble.
501 */
502static char **realloc_argv(unsigned *size, char **old_argv)
503{
504	char **argv;
505	unsigned new_size;
506	gfp_t gfp;
507
508	if (*size) {
509		new_size = *size * 2;
510		gfp = GFP_KERNEL;
511	} else {
512		new_size = 8;
513		gfp = GFP_NOIO;
514	}
515	argv = kmalloc_array(new_size, sizeof(*argv), gfp);
516	if (argv && old_argv) {
517		memcpy(argv, old_argv, *size * sizeof(*argv));
518		*size = new_size;
519	}
520
521	kfree(old_argv);
522	return argv;
523}
524
525/*
526 * Destructively splits up the argument list to pass to ctr.
527 */
528int dm_split_args(int *argc, char ***argvp, char *input)
529{
530	char *start, *end = input, *out, **argv = NULL;
531	unsigned array_size = 0;
532
533	*argc = 0;
534
535	if (!input) {
536		*argvp = NULL;
537		return 0;
538	}
539
540	argv = realloc_argv(&array_size, argv);
541	if (!argv)
542		return -ENOMEM;
543
544	while (1) {
545		/* Skip whitespace */
546		start = skip_spaces(end);
547
548		if (!*start)
549			break;	/* success, we hit the end */
550
551		/* 'out' is used to remove any back-quotes */
552		end = out = start;
553		while (*end) {
554			/* Everything apart from '\0' can be quoted */
555			if (*end == '\\' && *(end + 1)) {
556				*out++ = *(end + 1);
557				end += 2;
558				continue;
559			}
560
561			if (isspace(*end))
562				break;	/* end of token */
563
564			*out++ = *end++;
565		}
566
567		/* have we already filled the array? */
568		if ((*argc + 1) > array_size) {
569			argv = realloc_argv(&array_size, argv);
570			if (!argv)
571				return -ENOMEM;
572		}
573
574		/* we know this is whitespace */
575		if (*end)
576			end++;
577
578		/* terminate the string and put it in the array */
579		*out = '\0';
580		argv[*argc] = start;
581		(*argc)++;
582	}
583
584	*argvp = argv;
585	return 0;
586}
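
/*
 * Example: given the (writable) input string "foo\ bar  baz", dm_split_args()
 * terminates the tokens in place and returns argc == 2 with
 * argv[0] == "foo bar" (the backslash-quoted space is preserved) and
 * argv[1] == "baz".  The caller is responsible for kfree(argv).
 */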
587
588/*
589 * Impose necessary and sufficient conditions on a device's table such
590 * that any incoming bio which respects its logical_block_size can be
591 * processed successfully.  If it falls across the boundary between
592 * two or more targets, the size of each piece it gets split into must
593 * be compatible with the logical_block_size of the target processing it.
594 */
595static int validate_hardware_logical_block_alignment(struct dm_table *table,
596						 struct queue_limits *limits)
597{
598	/*
599	 * This function uses arithmetic modulo the logical_block_size
600	 * (in units of 512-byte sectors).
601	 */
602	unsigned short device_logical_block_size_sects =
603		limits->logical_block_size >> SECTOR_SHIFT;
604
605	/*
606	 * Offset of the start of the next table entry, mod logical_block_size.
607	 */
608	unsigned short next_target_start = 0;
609
610	/*
611	 * Given an aligned bio that extends beyond the end of a
612	 * target, how many sectors must the next target handle?
613	 */
614	unsigned short remaining = 0;
615
616	struct dm_target *ti;
617	struct queue_limits ti_limits;
618	unsigned i;
619
620	/*
621	 * Check each entry in the table in turn.
622	 */
623	for (i = 0; i < dm_table_get_num_targets(table); i++) {
624		ti = dm_table_get_target(table, i);
625
626		blk_set_stacking_limits(&ti_limits);
627
628		/* combine all target devices' limits */
629		if (ti->type->iterate_devices)
630			ti->type->iterate_devices(ti, dm_set_device_limits,
631						  &ti_limits);
632
633		/*
634		 * If the remaining sectors fall entirely within this
635		 * table entry, are they compatible with its logical_block_size?
636		 */
637		if (remaining < ti->len &&
638		    remaining & ((ti_limits.logical_block_size >>
639				  SECTOR_SHIFT) - 1))
640			break;	/* Error */
641
642		next_target_start =
643		    (unsigned short) ((next_target_start + ti->len) &
644				      (device_logical_block_size_sects - 1));
645		remaining = next_target_start ?
646		    device_logical_block_size_sects - next_target_start : 0;
647	}
648
649	if (remaining) {
650		DMWARN("%s: table line %u (start sect %llu len %llu) "
651		       "not aligned to h/w logical block size %u",
652		       dm_device_name(table->md), i,
653		       (unsigned long long) ti->begin,
654		       (unsigned long long) ti->len,
655		       limits->logical_block_size);
656		return -EINVAL;
657	}
658
659	return 0;
660}
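
/*
 * Example: with a 4096-byte logical_block_size (8 sectors), a target
 * boundary at a sector offset that is not a multiple of 8 leaves
 * 'remaining' non-zero.  Unless those leftover sectors are themselves a
 * multiple of the next target's logical block size (in sectors), or the
 * table does not end on an aligned boundary, the table is rejected with
 * -EINVAL.
 */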
661
662int dm_table_add_target(struct dm_table *t, const char *type,
663			sector_t start, sector_t len, char *params)
664{
665	int r = -EINVAL, argc;
666	char **argv;
667	struct dm_target *tgt;
668
669	if (t->singleton) {
670		DMERR("%s: target type %s must appear alone in table",
671		      dm_device_name(t->md), t->targets->type->name);
672		return -EINVAL;
673	}
674
675	BUG_ON(t->num_targets >= t->num_allocated);
676
677	tgt = t->targets + t->num_targets;
678	memset(tgt, 0, sizeof(*tgt));
679
680	if (!len) {
681		DMERR("%s: zero-length target", dm_device_name(t->md));
682		return -EINVAL;
683	}
684
685	tgt->type = dm_get_target_type(type);
686	if (!tgt->type) {
687		DMERR("%s: %s: unknown target type", dm_device_name(t->md), type);
688		return -EINVAL;
689	}
690
691	if (dm_target_needs_singleton(tgt->type)) {
692		if (t->num_targets) {
693			tgt->error = "singleton target type must appear alone in table";
694			goto bad;
695		}
696		t->singleton = true;
697	}
698
699	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
700		tgt->error = "target type may not be included in a read-only table";
701		goto bad;
702	}
703
704	if (t->immutable_target_type) {
705		if (t->immutable_target_type != tgt->type) {
706			tgt->error = "immutable target type cannot be mixed with other target types";
707			goto bad;
708		}
709	} else if (dm_target_is_immutable(tgt->type)) {
710		if (t->num_targets) {
711			tgt->error = "immutable target type cannot be mixed with other target types";
712			goto bad;
713		}
714		t->immutable_target_type = tgt->type;
715	}
716
717	if (dm_target_has_integrity(tgt->type))
718		t->integrity_added = 1;
719
720	tgt->table = t;
721	tgt->begin = start;
722	tgt->len = len;
723	tgt->error = "Unknown error";
724
725	/*
726	 * Does this target adjoin the previous one?
727	 */
728	if (!adjoin(t, tgt)) {
729		tgt->error = "Gap in table";
730		goto bad;
731	}
732
733	r = dm_split_args(&argc, &argv, params);
734	if (r) {
735		tgt->error = "couldn't split parameters (insufficient memory)";
736		goto bad;
737	}
738
739	r = tgt->type->ctr(tgt, argc, argv);
740	kfree(argv);
741	if (r)
742		goto bad;
743
744	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
745
746	if (!tgt->num_discard_bios && tgt->discards_supported)
747		DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
748		       dm_device_name(t->md), type);
749
750	return 0;
751
752 bad:
753	DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
754	dm_put_target_type(tgt->type);
755	return r;
756}
757
758/*
759 * Target argument parsing helpers.
760 */
761static int validate_next_arg(const struct dm_arg *arg,
762			     struct dm_arg_set *arg_set,
763			     unsigned *value, char **error, unsigned grouped)
764{
765	const char *arg_str = dm_shift_arg(arg_set);
766	char dummy;
767
768	if (!arg_str ||
769	    (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
770	    (*value < arg->min) ||
771	    (*value > arg->max) ||
772	    (grouped && arg_set->argc < *value)) {
773		*error = arg->error;
774		return -EINVAL;
775	}
776
777	return 0;
778}
779
780int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
781		unsigned *value, char **error)
782{
783	return validate_next_arg(arg, arg_set, value, error, 0);
784}
785EXPORT_SYMBOL(dm_read_arg);
786
787int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set,
788		      unsigned *value, char **error)
789{
790	return validate_next_arg(arg, arg_set, value, error, 1);
791}
792EXPORT_SYMBOL(dm_read_arg_group);
793
794const char *dm_shift_arg(struct dm_arg_set *as)
795{
796	char *r;
797
798	if (as->argc) {
799		as->argc--;
800		r = *as->argv;
801		as->argv++;
802		return r;
803	}
804
805	return NULL;
806}
807EXPORT_SYMBOL(dm_shift_arg);
808
809void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
810{
811	BUG_ON(as->argc < num_args);
812	as->argc -= num_args;
813	as->argv += num_args;
814}
815EXPORT_SYMBOL(dm_consume_args);
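
/*
 * A minimal sketch (illustrative names) of how targets use these helpers to
 * parse a counted group of optional arguments:
 *
 *	static const struct dm_arg _args[] = {
 *		{0, 4, "Invalid number of feature arguments"},
 *	};
 *	unsigned int num_features;
 *	int r = dm_read_arg_group(_args, &as, &num_features, &ti->error);
 *	if (r)
 *		return r;
 *	while (num_features--) {
 *		const char *feature = dm_shift_arg(&as);
 *		...
 *	}
 */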
816
817static bool __table_type_bio_based(enum dm_queue_mode table_type)
818{
819	return (table_type == DM_TYPE_BIO_BASED ||
820		table_type == DM_TYPE_DAX_BIO_BASED);
821}
822
823static bool __table_type_request_based(enum dm_queue_mode table_type)
824{
825	return table_type == DM_TYPE_REQUEST_BASED;
826}
827
828void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
829{
830	t->type = type;
831}
832EXPORT_SYMBOL_GPL(dm_table_set_type);
833
834/* validate the dax capability of the target device span */
835int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
836			sector_t start, sector_t len, void *data)
837{
838	int blocksize = *(int *) data, id;
839	bool rc;
840
841	id = dax_read_lock();
842	rc = !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
843	dax_read_unlock(id);
844
845	return rc;
846}
847
848/* Check devices support synchronous DAX */
849static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev,
850					      sector_t start, sector_t len, void *data)
851{
852	return !dev->dax_dev || !dax_synchronous(dev->dax_dev);
853}
854
855bool dm_table_supports_dax(struct dm_table *t,
856			   iterate_devices_callout_fn iterate_fn, int *blocksize)
857{
858	struct dm_target *ti;
859	unsigned i;
860
861	/* Ensure that all targets support DAX. */
862	for (i = 0; i < dm_table_get_num_targets(t); i++) {
863		ti = dm_table_get_target(t, i);
864
865		if (!ti->type->direct_access)
866			return false;
867
868		if (!ti->type->iterate_devices ||
869		    ti->type->iterate_devices(ti, iterate_fn, blocksize))
870			return false;
871	}
872
873	return true;
874}
875
876static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
877				  sector_t start, sector_t len, void *data)
878{
879	struct block_device *bdev = dev->bdev;
880	struct request_queue *q = bdev_get_queue(bdev);
881
882	/* request-based cannot stack on partitions! */
883	if (bdev_is_partition(bdev))
884		return false;
885
886	return queue_is_mq(q);
887}
888
889static int dm_table_determine_type(struct dm_table *t)
890{
891	unsigned i;
892	unsigned bio_based = 0, request_based = 0, hybrid = 0;
893	struct dm_target *tgt;
894	struct list_head *devices = dm_table_get_devices(t);
895	enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
896	int page_size = PAGE_SIZE;
897
898	if (t->type != DM_TYPE_NONE) {
899		/* target already set the table's type */
900		if (t->type == DM_TYPE_BIO_BASED) {
901			/* possibly upgrade to a variant of bio-based */
902			goto verify_bio_based;
903		}
904		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
905		goto verify_rq_based;
906	}
907
908	for (i = 0; i < t->num_targets; i++) {
909		tgt = t->targets + i;
910		if (dm_target_hybrid(tgt))
911			hybrid = 1;
912		else if (dm_target_request_based(tgt))
913			request_based = 1;
914		else
915			bio_based = 1;
916
917		if (bio_based && request_based) {
918			DMERR("Inconsistent table: different target types"
919			      " can't be mixed up");
920			return -EINVAL;
921		}
922	}
923
924	if (hybrid && !bio_based && !request_based) {
925		/*
926		 * The targets can work either way.
927		 * Determine the type from the live device.
928		 * Default to bio-based if device is new.
929		 */
930		if (__table_type_request_based(live_md_type))
931			request_based = 1;
932		else
933			bio_based = 1;
934	}
935
936	if (bio_based) {
937verify_bio_based:
938		/* We must use this table as bio-based */
939		t->type = DM_TYPE_BIO_BASED;
940		if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) ||
941		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
942			t->type = DM_TYPE_DAX_BIO_BASED;
943		}
944		return 0;
945	}
946
947	BUG_ON(!request_based); /* No targets in this table */
948
949	t->type = DM_TYPE_REQUEST_BASED;
950
951verify_rq_based:
952	/*
953	 * Request-based dm supports only tables that have a single target now.
954	 * To support multiple targets, request splitting support is needed,
955	 * and that needs lots of changes in the block-layer.
956	 * (e.g. request completion process for partial completion.)
957	 */
958	if (t->num_targets > 1) {
959		DMERR("request-based DM doesn't support multiple targets");
960		return -EINVAL;
961	}
962
963	if (list_empty(devices)) {
964		int srcu_idx;
965		struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
966
967		/* inherit live table's type */
968		if (live_table)
969			t->type = live_table->type;
970		dm_put_live_table(t->md, srcu_idx);
971		return 0;
972	}
973
974	tgt = dm_table_get_immutable_target(t);
975	if (!tgt) {
976		DMERR("table load rejected: immutable target is required");
977		return -EINVAL;
978	} else if (tgt->max_io_len) {
979		DMERR("table load rejected: immutable target that splits IO is not supported");
980		return -EINVAL;
981	}
982
983	/* Non-request-stackable devices can't be used for request-based dm */
984	if (!tgt->type->iterate_devices ||
985	    !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
986		DMERR("table load rejected: including non-request-stackable devices");
987		return -EINVAL;
988	}
989
990	return 0;
991}
992
993enum dm_queue_mode dm_table_get_type(struct dm_table *t)
994{
995	return t->type;
996}
997
998struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
999{
1000	return t->immutable_target_type;
1001}
1002
1003struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
1004{
1005	/* Immutable target is implicitly a singleton */
1006	if (t->num_targets > 1 ||
1007	    !dm_target_is_immutable(t->targets[0].type))
1008		return NULL;
1009
1010	return t->targets;
1011}
1012
1013struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
1014{
1015	struct dm_target *ti;
1016	unsigned i;
1017
1018	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1019		ti = dm_table_get_target(t, i);
1020		if (dm_target_is_wildcard(ti->type))
1021			return ti;
1022	}
1023
1024	return NULL;
1025}
1026
1027bool dm_table_bio_based(struct dm_table *t)
1028{
1029	return __table_type_bio_based(dm_table_get_type(t));
1030}
1031
1032bool dm_table_request_based(struct dm_table *t)
1033{
1034	return __table_type_request_based(dm_table_get_type(t));
1035}
1036
1037static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
1038{
1039	enum dm_queue_mode type = dm_table_get_type(t);
1040	unsigned per_io_data_size = 0;
1041	unsigned min_pool_size = 0;
1042	struct dm_target *ti;
1043	unsigned i;
1044
1045	if (unlikely(type == DM_TYPE_NONE)) {
1046		DMWARN("no table type is set, can't allocate mempools");
1047		return -EINVAL;
1048	}
1049
1050	if (__table_type_bio_based(type))
1051		for (i = 0; i < t->num_targets; i++) {
1052			ti = t->targets + i;
1053			per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
1054			min_pool_size = max(min_pool_size, ti->num_flush_bios);
1055		}
1056
1057	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported,
1058					   per_io_data_size, min_pool_size);
1059	if (!t->mempools)
1060		return -ENOMEM;
1061
1062	return 0;
1063}
1064
1065void dm_table_free_md_mempools(struct dm_table *t)
1066{
1067	dm_free_md_mempools(t->mempools);
1068	t->mempools = NULL;
1069}
1070
1071struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
1072{
1073	return t->mempools;
1074}
1075
1076static int setup_indexes(struct dm_table *t)
1077{
1078	int i;
1079	unsigned int total = 0;
1080	sector_t *indexes;
1081
1082	/* allocate the space for *all* the indexes */
1083	for (i = t->depth - 2; i >= 0; i--) {
1084		t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
1085		total += t->counts[i];
1086	}
1087
1088	indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
1089	if (!indexes)
1090		return -ENOMEM;
1091
1092	/* set up internal nodes, bottom-up */
1093	for (i = t->depth - 2; i >= 0; i--) {
1094		t->index[i] = indexes;
1095		indexes += (KEYS_PER_NODE * t->counts[i]);
1096		setup_btree_index(i, t);
1097	}
1098
1099	return 0;
1100}
1101
1102/*
1103 * Builds the btree to index the map.
1104 */
1105static int dm_table_build_index(struct dm_table *t)
1106{
1107	int r = 0;
1108	unsigned int leaf_nodes;
1109
1110	/* how many indexes will the btree have ? */
1111	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
1112	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
1113
1114	/* leaf layer has already been set up */
1115	t->counts[t->depth - 1] = leaf_nodes;
1116	t->index[t->depth - 1] = t->highs;
1117
1118	if (t->depth >= 2)
1119		r = setup_indexes(t);
1120
1121	return r;
1122}
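
/*
 * Example (assuming the 8-key/9-child node geometry noted above): a table
 * with 100 targets needs dm_div_up(100, 8) == 13 leaf nodes, giving
 * t->depth == 3: 13 leaves, dm_div_up(13, 9) == 2 internal nodes above
 * them, and a single root node.
 */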
1123
1124static bool integrity_profile_exists(struct gendisk *disk)
1125{
1126	return !!blk_get_integrity(disk);
1127}
1128
1129/*
1130 * Get a disk whose integrity profile reflects the table's profile.
1131 * Returns NULL if integrity support was inconsistent or unavailable.
1132 */
1133static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
1134{
1135	struct list_head *devices = dm_table_get_devices(t);
1136	struct dm_dev_internal *dd = NULL;
1137	struct gendisk *prev_disk = NULL, *template_disk = NULL;
1138	unsigned i;
1139
1140	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1141		struct dm_target *ti = dm_table_get_target(t, i);
1142		if (!dm_target_passes_integrity(ti->type))
1143			goto no_integrity;
1144	}
1145
1146	list_for_each_entry(dd, devices, list) {
1147		template_disk = dd->dm_dev->bdev->bd_disk;
1148		if (!integrity_profile_exists(template_disk))
1149			goto no_integrity;
1150		else if (prev_disk &&
1151			 blk_integrity_compare(prev_disk, template_disk) < 0)
1152			goto no_integrity;
1153		prev_disk = template_disk;
1154	}
1155
1156	return template_disk;
1157
1158no_integrity:
1159	if (prev_disk)
1160		DMWARN("%s: integrity not set: %s and %s profile mismatch",
1161		       dm_device_name(t->md),
1162		       prev_disk->disk_name,
1163		       template_disk->disk_name);
1164	return NULL;
1165}
1166
1167/*
1168 * Register the mapped device for blk_integrity support if the
1169 * underlying devices have an integrity profile.  But all devices may
1170 * not have matching profiles (checking all devices isn't reliable
1171 * during table load because this table may use other DM device(s) which
1172 * must be resumed before they will have an initialized integrity
1173 * profile).  Consequently, stacked DM devices force a 2-stage integrity
1174 * profile validation: First pass during table load, final pass during
1175 * resume.
1176 */
1177static int dm_table_register_integrity(struct dm_table *t)
1178{
1179	struct mapped_device *md = t->md;
1180	struct gendisk *template_disk = NULL;
1181
1182	/* If target handles integrity itself do not register it here. */
1183	if (t->integrity_added)
1184		return 0;
1185
1186	template_disk = dm_table_get_integrity_disk(t);
1187	if (!template_disk)
1188		return 0;
1189
1190	if (!integrity_profile_exists(dm_disk(md))) {
1191		t->integrity_supported = true;
1192		/*
1193		 * Register integrity profile during table load; we can do
1194		 * this because the final profile must match during resume.
1195		 */
1196		blk_integrity_register(dm_disk(md),
1197				       blk_get_integrity(template_disk));
1198		return 0;
1199	}
1200
1201	/*
1202	 * If DM device already has an initialized integrity
1203	 * profile the new profile should not conflict.
1204	 */
1205	if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
1206		DMWARN("%s: conflict with existing integrity profile: "
1207		       "%s profile mismatch",
1208		       dm_device_name(t->md),
1209		       template_disk->disk_name);
1210		return 1;
1211	}
1212
1213	/* Preserve existing integrity profile */
1214	t->integrity_supported = true;
1215	return 0;
1216}
1217
1218/*
1219 * Prepares the table for use by building the indices,
1220 * setting the type, and allocating mempools.
1221 */
1222int dm_table_complete(struct dm_table *t)
1223{
1224	int r;
1225
1226	r = dm_table_determine_type(t);
1227	if (r) {
1228		DMERR("unable to determine table type");
1229		return r;
1230	}
1231
1232	r = dm_table_build_index(t);
1233	if (r) {
1234		DMERR("unable to build btrees");
1235		return r;
1236	}
1237
1238	r = dm_table_register_integrity(t);
1239	if (r) {
1240		DMERR("could not register integrity profile.");
1241		return r;
1242	}
1243
1244	r = dm_table_alloc_md_mempools(t, t->md);
1245	if (r)
1246		DMERR("unable to allocate mempools");
1247
1248	return r;
1249}
1250
1251static DEFINE_MUTEX(_event_lock);
1252void dm_table_event_callback(struct dm_table *t,
1253			     void (*fn)(void *), void *context)
1254{
1255	mutex_lock(&_event_lock);
1256	t->event_fn = fn;
1257	t->event_context = context;
1258	mutex_unlock(&_event_lock);
1259}
1260
1261void dm_table_event(struct dm_table *t)
1262{
1263	mutex_lock(&_event_lock);
1264	if (t->event_fn)
1265		t->event_fn(t->event_context);
1266	mutex_unlock(&_event_lock);
1267}
1268EXPORT_SYMBOL(dm_table_event);
1269
1270inline sector_t dm_table_get_size(struct dm_table *t)
1271{
1272	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1273}
1274EXPORT_SYMBOL(dm_table_get_size);
1275
1276struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1277{
1278	if (index >= t->num_targets)
1279		return NULL;
1280
1281	return t->targets + index;
1282}
1283
1284/*
1285 * Search the btree for the correct target.
1286 *
1287 * Caller should check returned pointer for NULL
1288 * to trap I/O beyond end of device.
1289 */
1290struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1291{
1292	unsigned int l, n = 0, k = 0;
1293	sector_t *node;
1294
1295	if (unlikely(sector >= dm_table_get_size(t)))
1296		return NULL;
1297
1298	for (l = 0; l < t->depth; l++) {
1299		n = get_child(n, k);
1300		node = get_node(t, l, n);
1301
1302		for (k = 0; k < KEYS_PER_NODE; k++)
1303			if (node[k] >= sector)
1304				break;
1305	}
1306
1307	return &t->targets[(KEYS_PER_NODE * n) + k];
1308}
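
/*
 * The walk above descends one level per iteration: at each node, k selects
 * the first key that is >= sector, and the final (n, k) at the leaf level
 * indexes the target whose 'high' boundary covers the sector, i.e.
 * t->targets[KEYS_PER_NODE * n + k].
 */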
1309
1310/*
1311 * type->iterate_devices() should be called when the sanity check needs to
1312 * iterate and check all underlying data devices. iterate_devices() will
1313 * iterate all underlying data devices until it encounters a non-zero return
1314 * code, returned either by the input iterate_devices_callout_fn or by
1315 * iterate_devices() itself internally.
1316 *
1317 * For some target types (e.g. dm-stripe), one call of iterate_devices() may
1318 * iterate multiple underlying devices internally, in which case a non-zero
1319 * return code from iterate_devices_callout_fn will stop the iteration
1320 * early.
1321 *
1322 * Cases requiring _any_ underlying device to support some kind of attribute
1323 * should use an iteration structure like dm_table_any_dev_attr(), or call
1324 * it directly. @func should handle the semantics of positive examples, e.g.
1325 * being capable of something.
1326 *
1327 * Cases requiring _all_ underlying devices to support some kind of attribute
1328 * should use an iteration structure like dm_table_supports_nowait() or
1329 * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that
1330 * uses an @anti_func that handles the semantics of counter-examples, e.g. not
1331 * being capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data);
1332 */
1333static bool dm_table_any_dev_attr(struct dm_table *t,
1334				  iterate_devices_callout_fn func, void *data)
1335{
1336	struct dm_target *ti;
1337	unsigned int i;
1338
1339	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1340		ti = dm_table_get_target(t, i);
1341
1342		if (ti->type->iterate_devices &&
1343		    ti->type->iterate_devices(ti, func, data))
1344			return true;
1345	}
1346
1347	return false;
1348}
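
/*
 * A minimal sketch (illustrative names; blk_queue_x() is a hypothetical
 * predicate) of the "_all_ devices" inversion described above: express
 * "every device supports X" as the negation of "any device lacks X":
 *
 *	static int device_lacks_x(struct dm_target *ti, struct dm_dev *dev,
 *				  sector_t start, sector_t len, void *data)
 *	{
 *		struct request_queue *q = bdev_get_queue(dev->bdev);
 *
 *		return q && !blk_queue_x(q);
 *	}
 *
 *	bool all_support_x = !dm_table_any_dev_attr(t, device_lacks_x, NULL);
 */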
1349
1350static int count_device(struct dm_target *ti, struct dm_dev *dev,
1351			sector_t start, sector_t len, void *data)
1352{
1353	unsigned *num_devices = data;
1354
1355	(*num_devices)++;
1356
1357	return 0;
1358}
1359
1360/*
1361 * Check whether a table has no data devices attached using each
1362 * target's iterate_devices method.
1363 * Returns false if the result is unknown because a target doesn't
1364 * support iterate_devices.
1365 */
1366bool dm_table_has_no_data_devices(struct dm_table *table)
1367{
1368	struct dm_target *ti;
1369	unsigned i, num_devices;
1370
1371	for (i = 0; i < dm_table_get_num_targets(table); i++) {
1372		ti = dm_table_get_target(table, i);
1373
1374		if (!ti->type->iterate_devices)
1375			return false;
1376
1377		num_devices = 0;
1378		ti->type->iterate_devices(ti, count_device, &num_devices);
1379		if (num_devices)
1380			return false;
1381	}
1382
1383	return true;
1384}
1385
1386static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
1387				  sector_t start, sector_t len, void *data)
1388{
1389	struct request_queue *q = bdev_get_queue(dev->bdev);
1390	enum blk_zoned_model *zoned_model = data;
1391
1392	return !q || blk_queue_zoned_model(q) != *zoned_model;
1393}
1394
1395/*
1396 * Check the device zoned model based on the target feature flag. If the target
1397 * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
1398 * also accepted but all devices must have the same zoned model. If the target
1399 * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
1400 * zoned model with all zoned devices having the same zone size.
1401 */
1402static bool dm_table_supports_zoned_model(struct dm_table *t,
1403					  enum blk_zoned_model zoned_model)
1404{
1405	struct dm_target *ti;
1406	unsigned i;
1407
1408	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1409		ti = dm_table_get_target(t, i);
1410
1411		if (dm_target_supports_zoned_hm(ti->type)) {
1412			if (!ti->type->iterate_devices ||
1413			    ti->type->iterate_devices(ti, device_not_zoned_model,
1414						      &zoned_model))
1415				return false;
1416		} else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
1417			if (zoned_model == BLK_ZONED_HM)
1418				return false;
1419		}
1420	}
1421
1422	return true;
1423}
1424
1425static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
1426					   sector_t start, sector_t len, void *data)
1427{
1428	struct request_queue *q = bdev_get_queue(dev->bdev);
1429	unsigned int *zone_sectors = data;
1430
1431	if (!blk_queue_is_zoned(q))
1432		return 0;
1433
1434	return !q || blk_queue_zone_sectors(q) != *zone_sectors;
1435}
1436
1437/*
1438 * Check consistency of zoned model and zone sectors across all targets. For
1439 * zone sectors, if the destination device is a zoned block device, it shall
1440 * have the specified zone_sectors.
1441 */
1442static int validate_hardware_zoned_model(struct dm_table *table,
1443					 enum blk_zoned_model zoned_model,
1444					 unsigned int zone_sectors)
1445{
1446	if (zoned_model == BLK_ZONED_NONE)
1447		return 0;
1448
1449	if (!dm_table_supports_zoned_model(table, zoned_model)) {
1450		DMERR("%s: zoned model is not consistent across all devices",
1451		      dm_device_name(table->md));
1452		return -EINVAL;
1453	}
1454
1455	/* Check zone size validity and compatibility */
1456	if (!zone_sectors || !is_power_of_2(zone_sectors))
1457		return -EINVAL;
1458
1459	if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) {
1460		DMERR("%s: zone sectors is not consistent across all zoned devices",
1461		      dm_device_name(table->md));
1462		return -EINVAL;
1463	}
1464
1465	return 0;
1466}
1467
1468/*
1469 * Establish the new table's queue_limits and validate them.
1470 */
1471int dm_calculate_queue_limits(struct dm_table *table,
1472			      struct queue_limits *limits)
1473{
1474	struct dm_target *ti;
1475	struct queue_limits ti_limits;
1476	unsigned i;
1477	enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
1478	unsigned int zone_sectors = 0;
1479
1480	blk_set_stacking_limits(limits);
1481
1482	for (i = 0; i < dm_table_get_num_targets(table); i++) {
1483		blk_set_stacking_limits(&ti_limits);
1484
1485		ti = dm_table_get_target(table, i);
1486
1487		if (!ti->type->iterate_devices)
1488			goto combine_limits;
1489
1490		/*
1491		 * Combine queue limits of all the devices this target uses.
1492		 */
1493		ti->type->iterate_devices(ti, dm_set_device_limits,
1494					  &ti_limits);
1495
1496		if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
1497			/*
1498			 * After stacking all limits, validate all devices
1499			 * in table support this zoned model and zone sectors.
1500			 */
1501			zoned_model = ti_limits.zoned;
1502			zone_sectors = ti_limits.chunk_sectors;
1503		}
1504
1505		/* Set I/O hints portion of queue limits */
1506		if (ti->type->io_hints)
1507			ti->type->io_hints(ti, &ti_limits);
1508
1509		/*
1510		 * Check each device area is consistent with the target's
1511		 * overall queue limits.
1512		 */
1513		if (ti->type->iterate_devices(ti, device_area_is_invalid,
1514					      &ti_limits))
1515			return -EINVAL;
1516
1517combine_limits:
1518		/*
1519		 * Merge this target's queue limits into the overall limits
1520		 * for the table.
1521		 */
1522		if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1523			DMWARN("%s: adding target device "
1524			       "(start sect %llu len %llu) "
1525			       "caused an alignment inconsistency",
1526			       dm_device_name(table->md),
1527			       (unsigned long long) ti->begin,
1528			       (unsigned long long) ti->len);
1529	}
1530
1531	/*
1532	 * Verify that the zoned model and zone sectors, as determined before
1533	 * any .io_hints override, are the same across all devices in the table.
1534	 * - this is especially relevant if .io_hints is emulating a drive-managed
1535	 *   zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
1536	 * BUT...
1537	 */
1538	if (limits->zoned != BLK_ZONED_NONE) {
1539		/*
1540		 * ...IF the above limits stacking determined a zoned model
1541		 * validate that all of the table's devices conform to it.
1542		 */
1543		zoned_model = limits->zoned;
1544		zone_sectors = limits->chunk_sectors;
1545	}
1546	if (validate_hardware_zoned_model(table, zoned_model, zone_sectors))
1547		return -EINVAL;
1548
1549	return validate_hardware_logical_block_alignment(table, limits);
1550}
1551
1552/*
1553 * Verify that all devices have an integrity profile that matches the
1554 * DM device's registered integrity profile.  If the profiles don't
1555 * match then unregister the DM device's integrity profile.
1556 */
1557static void dm_table_verify_integrity(struct dm_table *t)
1558{
1559	struct gendisk *template_disk = NULL;
1560
1561	if (t->integrity_added)
1562		return;
1563
1564	if (t->integrity_supported) {
1565		/*
1566		 * Verify that the original integrity profile
1567		 * matches all the devices in this table.
1568		 */
1569		template_disk = dm_table_get_integrity_disk(t);
1570		if (template_disk &&
1571		    blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
1572			return;
1573	}
1574
1575	if (integrity_profile_exists(dm_disk(t->md))) {
1576		DMWARN("%s: unable to establish an integrity profile",
1577		       dm_device_name(t->md));
1578		blk_integrity_unregister(dm_disk(t->md));
1579	}
1580}
1581
1582static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1583				sector_t start, sector_t len, void *data)
1584{
1585	unsigned long flush = (unsigned long) data;
1586	struct request_queue *q = bdev_get_queue(dev->bdev);
1587
1588	return q && (q->queue_flags & flush);
1589}
1590
1591static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
1592{
1593	struct dm_target *ti;
1594	unsigned i;
1595
1596	/*
1597	 * Require at least one underlying device to support flushes.
1598	 * t->devices includes internal dm devices such as mirror logs
1599	 * so we need to use iterate_devices here, which targets
1600	 * supporting flushes must provide.
1601	 */
1602	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1603		ti = dm_table_get_target(t, i);
1604
1605		if (!ti->num_flush_bios)
1606			continue;
1607
1608		if (ti->flush_supported)
1609			return true;
1610
1611		if (ti->type->iterate_devices &&
1612		    ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
1613			return true;
1614	}
1615
1616	return false;
1617}
1618
1619static int device_dax_write_cache_enabled(struct dm_target *ti,
1620					  struct dm_dev *dev, sector_t start,
1621					  sector_t len, void *data)
1622{
1623	struct dax_device *dax_dev = dev->dax_dev;
1624
1625	if (!dax_dev)
1626		return false;
1627
1628	if (dax_write_cache_enabled(dax_dev))
1629		return true;
1630	return false;
1631}
1632
1633static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev,
1634				sector_t start, sector_t len, void *data)
1635{
1636	struct request_queue *q = bdev_get_queue(dev->bdev);
1637
1638	return q && !blk_queue_nonrot(q);
1639}
1640
1641static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1642			     sector_t start, sector_t len, void *data)
1643{
1644	struct request_queue *q = bdev_get_queue(dev->bdev);
1645
1646	return q && !blk_queue_add_random(q);
1647}
1648
1649static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
1650					 sector_t start, sector_t len, void *data)
1651{
1652	struct request_queue *q = bdev_get_queue(dev->bdev);
1653
1654	return q && !q->limits.max_write_same_sectors;
1655}
1656
1657static bool dm_table_supports_write_same(struct dm_table *t)
1658{
1659	struct dm_target *ti;
1660	unsigned i;
1661
1662	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1663		ti = dm_table_get_target(t, i);
1664
1665		if (!ti->num_write_same_bios)
1666			return false;
1667
1668		if (!ti->type->iterate_devices ||
1669		    ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
1670			return false;
1671	}
1672
1673	return true;
1674}
1675
1676static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
1677					   sector_t start, sector_t len, void *data)
1678{
1679	struct request_queue *q = bdev_get_queue(dev->bdev);
1680
1681	return q && !q->limits.max_write_zeroes_sectors;
1682}
1683
1684static bool dm_table_supports_write_zeroes(struct dm_table *t)
1685{
1686	struct dm_target *ti;
1687	unsigned i = 0;
1688
1689	while (i < dm_table_get_num_targets(t)) {
1690		ti = dm_table_get_target(t, i++);
1691
1692		if (!ti->num_write_zeroes_bios)
1693			return false;
1694
1695		if (!ti->type->iterate_devices ||
1696		    ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
1697			return false;
1698	}
1699
1700	return true;
1701}
1702
1703static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
1704				     sector_t start, sector_t len, void *data)
1705{
1706	struct request_queue *q = bdev_get_queue(dev->bdev);
1707
1708	return q && !blk_queue_nowait(q);
1709}
1710
1711static bool dm_table_supports_nowait(struct dm_table *t)
1712{
1713	struct dm_target *ti;
1714	unsigned i = 0;
1715
1716	while (i < dm_table_get_num_targets(t)) {
1717		ti = dm_table_get_target(t, i++);
1718
1719		if (!dm_target_supports_nowait(ti->type))
1720			return false;
1721
1722		if (!ti->type->iterate_devices ||
1723		    ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
1724			return false;
1725	}
1726
1727	return true;
1728}
1729
1730static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1731				      sector_t start, sector_t len, void *data)
1732{
1733	struct request_queue *q = bdev_get_queue(dev->bdev);
1734
1735	return q && !blk_queue_discard(q);
1736}
1737
1738static bool dm_table_supports_discards(struct dm_table *t)
1739{
1740	struct dm_target *ti;
1741	unsigned i;
1742
1743	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1744		ti = dm_table_get_target(t, i);
1745
1746		if (!ti->num_discard_bios)
1747			return false;
1748
1749		/*
1750		 * Either the target provides discard support (as implied by setting
1751		 * 'discards_supported') or it relies on _all_ data devices having
1752		 * discard support.
1753		 */
1754		if (!ti->discards_supported &&
1755		    (!ti->type->iterate_devices ||
1756		     ti->type->iterate_devices(ti, device_not_discard_capable, NULL)))
1757			return false;
1758	}
1759
1760	return true;
1761}
1762
1763static int device_not_secure_erase_capable(struct dm_target *ti,
1764					   struct dm_dev *dev, sector_t start,
1765					   sector_t len, void *data)
1766{
1767	struct request_queue *q = bdev_get_queue(dev->bdev);
1768
1769	return q && !blk_queue_secure_erase(q);
1770}
1771
1772static bool dm_table_supports_secure_erase(struct dm_table *t)
1773{
1774	struct dm_target *ti;
1775	unsigned int i;
1776
1777	for (i = 0; i < dm_table_get_num_targets(t); i++) {
1778		ti = dm_table_get_target(t, i);
1779
1780		if (!ti->num_secure_erase_bios)
1781			return false;
1782
1783		if (!ti->type->iterate_devices ||
1784		    ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL))
1785			return false;
1786	}
1787
1788	return true;
1789}
1790
1791static int device_requires_stable_pages(struct dm_target *ti,
1792					struct dm_dev *dev, sector_t start,
1793					sector_t len, void *data)
1794{
1795	struct request_queue *q = bdev_get_queue(dev->bdev);
1796
1797	return q && blk_queue_stable_writes(q);
1798}
1799
1800void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1801			       struct queue_limits *limits)
1802{
1803	bool wc = false, fua = false;
1804	int page_size = PAGE_SIZE;
1805
1806	/*
1807	 * Copy table's limits to the DM device's request_queue
1808	 */
1809	q->limits = *limits;
1810
1811	if (dm_table_supports_nowait(t))
1812		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
1813	else
1814		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
1815
1816	if (!dm_table_supports_discards(t)) {
1817		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
1818		/* Must also clear discard limits... */
1819		q->limits.max_discard_sectors = 0;
1820		q->limits.max_hw_discard_sectors = 0;
1821		q->limits.discard_granularity = 0;
1822		q->limits.discard_alignment = 0;
1823		q->limits.discard_misaligned = 0;
1824	} else
1825		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
1826
1827	if (dm_table_supports_secure_erase(t))
1828		blk_queue_flag_set(QUEUE_FLAG_SECERASE, q);
1829
1830	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
1831		wc = true;
1832		if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
1833			fua = true;
1834	}
1835	blk_queue_write_cache(q, wc, fua);
1836
1837	if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) {
1838		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
1839		if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL))
1840			set_dax_synchronous(t->md->dax_dev);
1841	}
1842	else
1843		blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
1844
1845	if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
1846		dax_write_cache(t->md->dax_dev, true);
1847
1848	/* Ensure that all underlying devices are non-rotational. */
1849	if (dm_table_any_dev_attr(t, device_is_rotational, NULL))
1850		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
1851	else
1852		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
1853
1854	if (!dm_table_supports_write_same(t))
1855		q->limits.max_write_same_sectors = 0;
1856	if (!dm_table_supports_write_zeroes(t))
1857		q->limits.max_write_zeroes_sectors = 0;
1858
1859	dm_table_verify_integrity(t);
1860
1861	/*
1862	 * Some devices don't use blk_integrity but still want stable pages
1863	 * because they do their own checksumming.
1864	 * If any underlying device requires stable pages, a table must require
1865	 * them as well.  Only targets that support iterate_devices are considered:
1866	 * we don't want error, zero, etc. to require stable pages.
1867	 */
1868	if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL))
1869		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
1870	else
1871		blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
1872
1873	/*
1874	 * Determine whether or not this queue's I/O timings contribute
1875	 * to the entropy pool. Only request-based targets use this.
1876	 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
1877	 * have it set.
1878	 */
1879	if (blk_queue_add_random(q) &&
1880	    dm_table_any_dev_attr(t, device_is_not_random, NULL))
1881		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
1882
1883	/*
1884	 * For a zoned target, the number of zones should be updated for the
1885	 * For a zoned target, the number of zones should be updated so that the
1886	 * correct value is exposed in sysfs queue/nr_zones. For a bio-based
1887	 */
1888#ifdef CONFIG_BLK_DEV_ZONED
1889	if (blk_queue_is_zoned(q)) {
1890		WARN_ON_ONCE(queue_is_mq(q));
1891		q->nr_zones = blkdev_nr_zones(t->md->disk);
1892	}
1893#endif
1894
1895	blk_queue_update_readahead(q);
1896}
1897
1898unsigned int dm_table_get_num_targets(struct dm_table *t)
1899{
1900	return t->num_targets;
1901}
1902
1903struct list_head *dm_table_get_devices(struct dm_table *t)
1904{
1905	return &t->devices;
1906}
1907
1908fmode_t dm_table_get_mode(struct dm_table *t)
1909{
1910	return t->mode;
1911}
1912EXPORT_SYMBOL(dm_table_get_mode);
1913
1914enum suspend_mode {
1915	PRESUSPEND,
1916	PRESUSPEND_UNDO,
1917	POSTSUSPEND,
1918};
1919
1920static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
1921{
1922	int i = t->num_targets;
1923	struct dm_target *ti = t->targets;
1924
1925	lockdep_assert_held(&t->md->suspend_lock);
1926
1927	while (i--) {
1928		switch (mode) {
1929		case PRESUSPEND:
1930			if (ti->type->presuspend)
1931				ti->type->presuspend(ti);
1932			break;
1933		case PRESUSPEND_UNDO:
1934			if (ti->type->presuspend_undo)
1935				ti->type->presuspend_undo(ti);
1936			break;
1937		case POSTSUSPEND:
1938			if (ti->type->postsuspend)
1939				ti->type->postsuspend(ti);
1940			break;
1941		}
1942		ti++;
1943	}
1944}
1945
1946void dm_table_presuspend_targets(struct dm_table *t)
1947{
1948	if (!t)
1949		return;
1950
1951	suspend_targets(t, PRESUSPEND);
1952}
1953
1954void dm_table_presuspend_undo_targets(struct dm_table *t)
1955{
1956	if (!t)
1957		return;
1958
1959	suspend_targets(t, PRESUSPEND_UNDO);
1960}
1961
1962void dm_table_postsuspend_targets(struct dm_table *t)
1963{
1964	if (!t)
1965		return;
1966
1967	suspend_targets(t, POSTSUSPEND);
1968}
1969
1970int dm_table_resume_targets(struct dm_table *t)
1971{
1972	int i, r = 0;
1973
1974	lockdep_assert_held(&t->md->suspend_lock);
1975
1976	for (i = 0; i < t->num_targets; i++) {
1977		struct dm_target *ti = t->targets + i;
1978
1979		if (!ti->type->preresume)
1980			continue;
1981
1982		r = ti->type->preresume(ti);
1983		if (r) {
1984			DMERR("%s: %s: preresume failed, error = %d",
1985			      dm_device_name(t->md), ti->type->name, r);
1986			return r;
1987		}
1988	}
1989
1990	for (i = 0; i < t->num_targets; i++) {
1991		struct dm_target *ti = t->targets + i;
1992
1993		if (ti->type->resume)
1994			ti->type->resume(ti);
1995	}
1996
1997	return 0;
1998}
1999
2000struct mapped_device *dm_table_get_md(struct dm_table *t)
2001{
2002	return t->md;
2003}
2004EXPORT_SYMBOL(dm_table_get_md);
2005
2006const char *dm_table_device_name(struct dm_table *t)
2007{
2008	return dm_device_name(t->md);
2009}
2010EXPORT_SYMBOL_GPL(dm_table_device_name);
2011
2012void dm_table_run_md_queue_async(struct dm_table *t)
2013{
2014	if (!dm_table_request_based(t))
2015		return;
2016
2017	if (t->md->queue)
2018		blk_mq_run_hw_queues(t->md->queue, true);
2019}
2020EXPORT_SYMBOL(dm_table_run_md_queue_async);
2021
2022