xref: /kernel/linux/linux-5.10/drivers/block/brd.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Ram backed block device driver.
4 *
5 * Copyright (C) 2007 Nick Piggin
6 * Copyright (C) 2007 Novell Inc.
7 *
8 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
9 * of their respective owners.
10 */
11
12#include <linux/init.h>
13#include <linux/initrd.h>
14#include <linux/module.h>
15#include <linux/moduleparam.h>
16#include <linux/major.h>
17#include <linux/blkdev.h>
18#include <linux/bio.h>
19#include <linux/highmem.h>
20#include <linux/mutex.h>
21#include <linux/radix-tree.h>
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/backing-dev.h>
25
26#include <linux/uaccess.h>
27
28#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
29#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
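/*
 * For example, with 4096-byte pages and 512-byte sectors this works out to
 * PAGE_SECTORS_SHIFT = 12 - 9 = 3 and PAGE_SECTORS = 8, i.e. eight sectors
 * of backing store per page.
 */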
30
31/*
32 * Each block ramdisk device has a radix_tree, brd_pages, which stores the
33 * pages containing the block device's contents. A brd page's ->index is
34 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
35 * with, the kernel's pagecache or buffer cache (which sit above our block
36 * device).
37 */
38struct brd_device {
39	int		brd_number;
40
41	struct request_queue	*brd_queue;
42	struct gendisk		*brd_disk;
43	struct list_head	brd_list;
44
45	/*
46	 * Backing store of pages and lock to protect it. This is the contents
47	 * of the block device.
48	 */
49	spinlock_t		brd_lock;
50	struct radix_tree_root	brd_pages;
51};
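
/*
 * Note: backing pages are allocated lazily by brd_insert_page() on the first
 * write to a given page-sized region, and are only freed by brd_free_pages()
 * when the whole device is torn down.
 */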
52
53/*
54 * Look up and return a brd's page for a given sector.
55 */
56static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
57{
58	pgoff_t idx;
59	struct page *page;
60
61	/*
62	 * The page lifetime is protected by the fact that we have opened the
63	 * device node -- brd pages will never be deleted under us, so we
64	 * don't need any further locking or refcounting.
65	 *
66	 * This is strictly true for the radix-tree nodes as well (i.e. we
67	 * don't actually need the rcu_read_lock()); however, that is not a
68	 * documented feature of the radix-tree API so it is better to be
69	 * safe here (we don't have total exclusion from radix tree updates
70	 * here, only deletes).
71	 */
72	rcu_read_lock();
73	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
74	page = radix_tree_lookup(&brd->brd_pages, idx);
75	rcu_read_unlock();
76
77	BUG_ON(page && page->index != idx);
78
79	return page;
80}
81
82/*
83 * Insert a new page for a given sector, if one does not already exist.
84 */
85static int brd_insert_page(struct brd_device *brd, sector_t sector)
86{
87	pgoff_t idx;
88	struct page *page;
89	gfp_t gfp_flags;
90
91	page = brd_lookup_page(brd, sector);
92	if (page)
93		return 0;
94
95	/*
96	 * Must use NOIO because we don't want to recurse back into the
97	 * block or filesystem layers from page reclaim.
98	 */
99	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
100	page = alloc_page(gfp_flags);
101	if (!page)
102		return -ENOMEM;
103
104	if (radix_tree_preload(GFP_NOIO)) {
105		__free_page(page);
106		return -ENOMEM;
107	}
108
109	spin_lock(&brd->brd_lock);
110	idx = sector >> PAGE_SECTORS_SHIFT;
111	page->index = idx;
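	/*
	 * Another task may have inserted a page for this index between the
	 * unlocked lookup above and taking brd_lock; if so, drop our page
	 * and reuse the existing one.
	 */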
112	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
113		__free_page(page);
114		page = radix_tree_lookup(&brd->brd_pages, idx);
115		BUG_ON(!page);
116		BUG_ON(page->index != idx);
117	}
118	spin_unlock(&brd->brd_lock);
119
120	radix_tree_preload_end();
121	return 0;
122}
123
124/*
125 * Free all backing store pages and radix tree. This must only be called when
126 * there are no other users of the device.
127 */
128#define FREE_BATCH 16
129static void brd_free_pages(struct brd_device *brd)
130{
131	unsigned long pos = 0;
132	struct page *pages[FREE_BATCH];
133	int nr_pages;
134
135	do {
136		int i;
137
138		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
139				(void **)pages, pos, FREE_BATCH);
140
141		for (i = 0; i < nr_pages; i++) {
142			void *ret;
143
144			BUG_ON(pages[i]->index < pos);
145			pos = pages[i]->index;
146			ret = radix_tree_delete(&brd->brd_pages, pos);
147			BUG_ON(!ret || ret != pages[i]);
148			__free_page(pages[i]);
149		}
150
151		pos++;
152
153		/*
154		 * It takes about 3.4 seconds to remove an 80GiB ramdisk, so we
155		 * need cond_resched() here to avoid stalling the CPU.
156		 */
157		cond_resched();
158
159		/*
160		 * This assumes radix_tree_gang_lookup always returns as
161		 * many pages as possible. If the radix-tree code ever changes,
162		 * this will have to change as well.
163		 */
164	} while (nr_pages == FREE_BATCH);
165}
166
167/*
168 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
169 */
170static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
171{
172	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
173	size_t copy;
174	int ret;
175
176	copy = min_t(size_t, n, PAGE_SIZE - offset);
177	ret = brd_insert_page(brd, sector);
178	if (ret)
179		return ret;
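	/* The range may straddle a page boundary; allocate the second page too. */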
180	if (copy < n) {
181		sector += copy >> SECTOR_SHIFT;
182		ret = brd_insert_page(brd, sector);
183	}
184	return ret;
185}
186
187/*
188 * Copy n bytes from src to the brd starting at sector. Does not sleep.
189 */
190static void copy_to_brd(struct brd_device *brd, const void *src,
191			sector_t sector, size_t n)
192{
193	struct page *page;
194	void *dst;
195	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
196	size_t copy;
197
198	copy = min_t(size_t, n, PAGE_SIZE - offset);
199	page = brd_lookup_page(brd, sector);
200	BUG_ON(!page);
201
202	dst = kmap_atomic(page);
203	memcpy(dst + offset, src, copy);
204	kunmap_atomic(dst);
205
206	if (copy < n) {
207		src += copy;
208		sector += copy >> SECTOR_SHIFT;
209		copy = n - copy;
210		page = brd_lookup_page(brd, sector);
211		BUG_ON(!page);
212
213		dst = kmap_atomic(page);
214		memcpy(dst, src, copy);
215		kunmap_atomic(dst);
216	}
217}
218
219/*
220 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
221 */
222static void copy_from_brd(void *dst, struct brd_device *brd,
223			sector_t sector, size_t n)
224{
225	struct page *page;
226	void *src;
227	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
228	size_t copy;
229
230	copy = min_t(size_t, n, PAGE_SIZE - offset);
231	page = brd_lookup_page(brd, sector);
232	if (page) {
233		src = kmap_atomic(page);
234		memcpy(dst, src + offset, copy);
235		kunmap_atomic(src);
236	} else
237		memset(dst, 0, copy);
238
239	if (copy < n) {
240		dst += copy;
241		sector += copy >> SECTOR_SHIFT;
242		copy = n - copy;
243		page = brd_lookup_page(brd, sector);
244		if (page) {
245			src = kmap_atomic(page);
246			memcpy(dst, src, copy);
247			kunmap_atomic(src);
248		} else
249			memset(dst, 0, copy);
250	}
251}
252
253/*
254 * Process a single bvec of a bio.
255 */
256static int brd_do_bvec(struct brd_device *brd, struct page *page,
257			unsigned int len, unsigned int off, unsigned int op,
258			sector_t sector)
259{
260	void *mem;
261	int err = 0;
262
263	if (op_is_write(op)) {
264		err = copy_to_brd_setup(brd, sector, len);
265		if (err)
266			goto out;
267	}
268
269	mem = kmap_atomic(page);
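	/*
	 * flush_dcache_page() is called after filling the page on a read and
	 * before reading it on a write, so the kernel and any user mappings
	 * of the page stay coherent on architectures with aliasing D-caches.
	 */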
270	if (!op_is_write(op)) {
271		copy_from_brd(mem + off, brd, sector, len);
272		flush_dcache_page(page);
273	} else {
274		flush_dcache_page(page);
275		copy_to_brd(brd, mem + off, sector, len);
276	}
277	kunmap_atomic(mem);
278
279out:
280	return err;
281}
282
283static blk_qc_t brd_submit_bio(struct bio *bio)
284{
285	struct brd_device *brd = bio->bi_disk->private_data;
286	struct bio_vec bvec;
287	sector_t sector;
288	struct bvec_iter iter;
289
290	sector = bio->bi_iter.bi_sector;
291	if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
292		goto io_error;
293
294	bio_for_each_segment(bvec, bio, iter) {
295		unsigned int len = bvec.bv_len;
296		int err;
297
298		/* Unaligned buffers are not supported */
299		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
300				(len & (SECTOR_SIZE - 1)));
301
302		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
303				  bio_op(bio), sector);
304		if (err)
305			goto io_error;
306		sector += len >> SECTOR_SHIFT;
307	}
308
309	bio_endio(bio);
310	return BLK_QC_T_NONE;
311io_error:
312	bio_io_error(bio);
313	return BLK_QC_T_NONE;
314}
315
316static int brd_rw_page(struct block_device *bdev, sector_t sector,
317		       struct page *page, unsigned int op)
318{
319	struct brd_device *brd = bdev->bd_disk->private_data;
320	int err;
321
322	if (PageTransHuge(page))
323		return -ENOTSUPP;
324	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
325	page_endio(page, op_is_write(op), err);
326	return err;
327}
328
329static const struct block_device_operations brd_fops = {
330	.owner =		THIS_MODULE,
331	.submit_bio =		brd_submit_bio,
332	.rw_page =		brd_rw_page,
333};
334
335/*
336 * And now the module code and kernel interface.
337 */
338static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
339module_param(rd_nr, int, 0444);
340MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
341
342unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
343module_param(rd_size, ulong, 0444);
344MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
345
346static int max_part = 1;
347module_param(max_part, int, 0444);
348MODULE_PARM_DESC(max_part, "Number of minors to reserve between devices");
349
350MODULE_LICENSE("GPL");
351MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
352MODULE_ALIAS("rd");
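
/*
 * Example (hypothetical values): loading the module with four 1GiB ramdisks,
 * which then show up as /dev/ram0 .. /dev/ram3:
 *
 *	modprobe brd rd_nr=4 rd_size=1048576
 */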
353
354#ifndef MODULE
355/* Legacy boot options - nonmodular */
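/*
 * e.g. booting with ramdisk_size=65536 gives a 64MiB ramdisk (hypothetical
 * value; the unit is kbytes, as with rd_size).
 */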
356static int __init ramdisk_size(char *str)
357{
358	rd_size = simple_strtol(str, NULL, 0);
359	return 1;
360}
361__setup("ramdisk_size=", ramdisk_size);
362#endif
363
364/*
365 * The device scheme is derived from loop.c. Keep them in sync where possible
366 * (should share code eventually).
367 */
368static LIST_HEAD(brd_devices);
369static DEFINE_MUTEX(brd_devices_mutex);
370
371static struct brd_device *brd_alloc(int i)
372{
373	struct brd_device *brd;
374	struct gendisk *disk;
375
376	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
377	if (!brd)
378		goto out;
379	brd->brd_number		= i;
380	spin_lock_init(&brd->brd_lock);
381	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
382
383	brd->brd_queue = blk_alloc_queue(NUMA_NO_NODE);
384	if (!brd->brd_queue)
385		goto out_free_dev;
386
387	/* This is so fdisk will align partitions on 4k, because the
388	 * direct_access API needs 4k alignment and returns a PFN.
389	 * (This is only a problem on very small devices <= 4M;
390	 *  otherwise fdisk will align on 1M. Regardless, this call
391	 *  is harmless.)
392	 */
393	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
394	disk = brd->brd_disk = alloc_disk(max_part);
395	if (!disk)
396		goto out_free_queue;
397	disk->major		= RAMDISK_MAJOR;
398	disk->first_minor	= i * max_part;
399	disk->fops		= &brd_fops;
400	disk->private_data	= brd;
401	disk->flags		= GENHD_FL_EXT_DEVT;
402	sprintf(disk->disk_name, "ram%d", i);
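	/* rd_size is in KiB; set_capacity() takes 512-byte sectors, hence the * 2. */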
403	set_capacity(disk, rd_size * 2);
404
405	/* Tell the block layer that this is not a rotational device */
406	blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue);
407	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, brd->brd_queue);
408
409	return brd;
410
411out_free_queue:
412	blk_cleanup_queue(brd->brd_queue);
413out_free_dev:
414	kfree(brd);
415out:
416	return NULL;
417}
418
419static void brd_free(struct brd_device *brd)
420{
421	put_disk(brd->brd_disk);
422	blk_cleanup_queue(brd->brd_queue);
423	brd_free_pages(brd);
424	kfree(brd);
425}
426
427static struct brd_device *brd_init_one(int i, bool *new)
428{
429	struct brd_device *brd;
430
431	*new = false;
432	list_for_each_entry(brd, &brd_devices, brd_list) {
433		if (brd->brd_number == i)
434			goto out;
435	}
436
437	brd = brd_alloc(i);
438	if (brd) {
439		brd->brd_disk->queue = brd->brd_queue;
440		add_disk(brd->brd_disk);
441		list_add_tail(&brd->brd_list, &brd_devices);
442	}
443	*new = true;
444out:
445	return brd;
446}
447
448static void brd_del_one(struct brd_device *brd)
449{
450	list_del(&brd->brd_list);
451	del_gendisk(brd->brd_disk);
452	brd_free(brd);
453}
454
455static struct kobject *brd_probe(dev_t dev, int *part, void *data)
456{
457	struct brd_device *brd;
458	struct kobject *kobj;
459	bool new;
460
461	mutex_lock(&brd_devices_mutex);
462	brd = brd_init_one(MINOR(dev) / max_part, &new);
463	kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL;
464	mutex_unlock(&brd_devices_mutex);
465
466	if (new)
467		*part = 0;
468
469	return kobj;
470}
471
472static inline void brd_check_and_reset_par(void)
473{
474	if (unlikely(!max_part))
475		max_part = 1;
476
477	/*
478	 * Make sure (1U << MINORBITS) is exactly divisible by 'max_part';
479	 * otherwise it is possible to get the same dev_t when adding partitions.
480	 */
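	/* If it is not, round 'max_part' up to the next power of two (e.g. 3 becomes 4). */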
481	if ((1U << MINORBITS) % max_part != 0)
482		max_part = 1UL << fls(max_part);
483
484	if (max_part > DISK_MAX_PARTS) {
485		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
486			DISK_MAX_PARTS, DISK_MAX_PARTS);
487		max_part = DISK_MAX_PARTS;
488	}
489}
490
491static int __init brd_init(void)
492{
493	struct brd_device *brd, *next;
494	int i;
495
496	/*
497	 * The brd module can now instantiate the underlying device structure
498	 * on demand, provided that its device node is accessed.
499	 *
500	 * (1) If rd_nr is specified, create that many devices upfront; otherwise
501	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT.
502	 * (2) Users can further extend the brd devices by creating dev nodes
503	 *     themselves and having the kernel instantiate the actual device
504	 *     on demand. Example:
505	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
506	 *		fdisk -l /path/devnod_name
507	 *	If device (X / max_part) was not already created, it will be
508	 *	created dynamically.
509	 */
510
511	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
512		return -EIO;
513
514	brd_check_and_reset_par();
515
516	for (i = 0; i < rd_nr; i++) {
517		brd = brd_alloc(i);
518		if (!brd)
519			goto out_free;
520		list_add_tail(&brd->brd_list, &brd_devices);
521	}
522
523	/* point of no return */
524
525	list_for_each_entry(brd, &brd_devices, brd_list) {
526		/*
527		 * Associate the queue with the disk just before adding the
528		 * disk, to avoid messing up the failure path.
529		 */
530		brd->brd_disk->queue = brd->brd_queue;
531		add_disk(brd->brd_disk);
532	}
533
534	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
535				  THIS_MODULE, brd_probe, NULL, NULL);
536
537	pr_info("brd: module loaded\n");
538	return 0;
539
540out_free:
541	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
542		list_del(&brd->brd_list);
543		brd_free(brd);
544	}
545	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
546
547	pr_info("brd: module NOT loaded !!!\n");
548	return -ENOMEM;
549}
550
551static void __exit brd_exit(void)
552{
553	struct brd_device *brd, *next;
554
555	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
556		brd_del_one(brd);
557
558	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
559	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
560
561	pr_info("brd: module unloaded\n");
562}
563
564module_init(brd_init);
565module_exit(brd_exit);
566
567