1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2018 Cambridge Greys Ltd
4 * Copyright (C) 2015-2016 Anton Ivanov (aivanov@brocade.com)
5 * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
6 */
7
8/* 2001-09-28...2002-04-17
9 * Partition stuff by James_McMechan@hotmail.com
10 * old style ubd by setting UBD_SHIFT to 0
11 * 2002-09-27...2002-10-18 massive tinkering for 2.5
12 * partitions have changed in 2.5
13 * 2003-01-29 more tinkering for 2.5.59-1
14 * This should now address the sysfs problems and has
15 * the symlink for devfs to allow for booting with
16 * the common /dev/ubd/discX/... names rather than
17 * only /dev/ubdN/discN this version also has lots of
18 * clean ups preparing for ubd-many.
19 * James McMechan
20 */
21
/* Each unit owns (1 << UBD_SHIFT) minors; see ubd_disk_register(). */
#define UBD_SHIFT 4
23
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/blkdev.h>
27#include <linux/blk-mq.h>
28#include <linux/ata.h>
29#include <linux/hdreg.h>
30#include <linux/major.h>
31#include <linux/cdrom.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/ctype.h>
35#include <linux/slab.h>
36#include <linux/vmalloc.h>
37#include <linux/platform_device.h>
38#include <linux/scatterlist.h>
39#include <asm/tlbflush.h>
40#include <kern_util.h>
41#include "mconsole_kern.h"
42#include <init.h>
43#include <irq_kern.h>
44#include "ubd.h"
45#include <os.h>
46#include "cow.h"
47
/*
 * Max request size, in sectors.  It is bounded by the per-segment sector
 * mask (one bit per sector in an unsigned long); with a 64-bit long and
 * 512-byte sectors that is 32K.
 */
#define UBD_MAX_REQUEST (8 * sizeof(long))
50
/* Describes one segment of an io_thread_req. */
struct io_desc {
	char *buffer;
	unsigned long length;
	/* one bit per sector of this segment, filled in by cowify_req() */
	unsigned long sector_mask;
	/* byte offset of bitmap_words[], computed by cowify_bitmap() */
	unsigned long long cow_offset;
	/* copy of the two bitmap words starting at that offset */
	unsigned long bitmap_words[2];
};
58
59struct io_thread_req {
60	struct request *req;
61	int fds[2];
62	unsigned long offsets[2];
63	unsigned long long offset;
64	int sectorsize;
65	int error;
66
67	int desc_cnt;
68	/* io_desc has to be the last element of the struct */
69	struct io_desc io_desc[];
70};
71
72
/*
 * Request-pointer buffers used with bulk_req_safe_read().  Each buffer
 * comes with a "remainder" slot that keeps the bytes of a partially
 * read pointer until the next read completes it.
 */

/* Used by ubd_handler() */
static struct io_thread_req * (*irq_req_buffer)[];
static struct io_thread_req *irq_remainder;
static int irq_remainder_size;

/* Allocated in ubd_init() */
static struct io_thread_req * (*io_req_buffer)[];
static struct io_thread_req *io_remainder;
static int io_remainder_size;
80
81
82
/* Return 1 if bit number @bit of the byte array @data is set, else 0. */
static inline int ubd_test_bit(__u64 bit, unsigned char *data)
{
	const int bits_per_cell = sizeof(*data) * 8;
	__u64 cell = bit / bits_per_cell;
	int shift = bit % bits_per_cell;

	return (data[cell] >> shift) & 1;
}
93
/* Set bit number @bit in the byte array @data. */
static inline void ubd_set_bit(__u64 bit, unsigned char *data)
{
	const int bits_per_cell = sizeof(*data) * 8;
	__u64 cell = bit / bits_per_cell;
	int shift = bit % bits_per_cell;

	data[cell] |= (1 << shift);
}
104/*End stuff from ubd_user.h*/
105
106#define DRIVER_NAME "uml-blkdev"
107
108static DEFINE_MUTEX(ubd_lock);
109static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
110
111static int ubd_open(struct gendisk *disk, blk_mode_t mode);
112static void ubd_release(struct gendisk *disk);
113static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
114		     unsigned int cmd, unsigned long arg);
115static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
116
117#define MAX_DEV (16)
118
119static const struct block_device_operations ubd_blops = {
120        .owner		= THIS_MODULE,
121        .open		= ubd_open,
122        .release	= ubd_release,
123        .ioctl		= ubd_ioctl,
124        .compat_ioctl	= blkdev_compat_ptr_ioctl,
125	.getgeo		= ubd_getgeo,
126};
127
128/* Protected by ubd_lock */
129static struct gendisk *ubd_gendisk[MAX_DEV];
130
131#ifdef CONFIG_BLK_DEV_UBD_SYNC
132#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
133					 .cl = 1 })
134#else
135#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \
136					 .cl = 1 })
137#endif
138static struct openflags global_openflags = OPEN_FLAGS;
139
/* Backing-file state of a unit that runs on top of a COW file */
struct cow {
	/* backing file name */
	char *file;
	/* backing file fd */
	int fd;
	/* in-memory copy of the COW bitmap, vmalloc'ed by ubd_open_dev() */
	unsigned long *bitmap;
	/* size of the bitmap in bytes */
	unsigned long bitmap_len;
	/* offsets of the bitmap and of the data area */
	int bitmap_offset;
	int data_offset;
};
150
151#define MAX_SG 64
152
153struct ubd {
154	/* name (and fd, below) of the file opened for writing, either the
155	 * backing or the cow file. */
156	char *file;
157	char *serial;
158	int count;
159	int fd;
160	__u64 size;
161	struct openflags boot_openflags;
162	struct openflags openflags;
163	unsigned shared:1;
164	unsigned no_cow:1;
165	unsigned no_trim:1;
166	struct cow cow;
167	struct platform_device pdev;
168	struct request_queue *queue;
169	struct blk_mq_tag_set tag_set;
170	spinlock_t lock;
171};
172
/* Initialiser for an unused struct cow: no file, no fd, no bitmap */
#define DEFAULT_COW { \
	.file =			NULL, \
	.fd =			-1,	\
	.bitmap =		NULL, \
	.bitmap_offset =	0, \
	.data_offset =		0, \
}

/*
 * Initialiser for an unconfigured unit.  ubd_device_release() assigns
 * this back to the unit when its platform device goes away.
 */
#define DEFAULT_UBD { \
	.file = 		NULL, \
	.serial =		NULL, \
	.count =		0, \
	.fd =			-1, \
	.size =			-1, \
	.boot_openflags =	OPEN_FLAGS, \
	.openflags =		OPEN_FLAGS, \
	.no_cow =               0, \
	.no_trim =		0, \
	.shared =		0, \
	.cow =			DEFAULT_COW, \
	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
}
195
196/* Protected by ubd_lock */
197static struct ubd ubd_devs[MAX_DEV] = { [0 ... MAX_DEV - 1] = DEFAULT_UBD };
198
199static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
200				 const struct blk_mq_queue_data *bd);
201
202static int fake_ide_setup(char *str)
203{
204	pr_warn("The fake_ide option has been removed\n");
205	return 1;
206}
207__setup("fake_ide", fake_ide_setup);
208
209__uml_help(fake_ide_setup,
210"fake_ide\n"
211"    Obsolete stub.\n\n"
212);
213
/*
 * Parse a unit designator at *ptr: either a number (any base accepted by
 * simple_strtoul) or a single lower-case letter, 'a' being unit 0.
 *
 * On success *ptr is advanced past the designator and the unit number is
 * returned; otherwise -1 is returned and *ptr is left alone.  The result
 * is not range-checked against MAX_DEV.
 */
static int parse_unit(char **ptr)
{
	char *cur = *ptr;
	char *stop;
	int unit;

	if (isdigit(*cur)) {
		unit = simple_strtoul(cur, &stop, 0);
		if (stop == cur)
			return -1;
		*ptr = stop;
		return unit;
	}

	if ((*cur >= 'a') && (*cur <= 'z')) {
		*ptr = cur + 1;
		return *cur - 'a';
	}

	return -1;
}
232
233/* If *index_out == -1 at exit, the passed option was a general one;
234 * otherwise, the str pointer is used (and owned) inside ubd_devs array, so it
235 * should not be freed on exit.
236 */
237static int ubd_setup_common(char *str, int *index_out, char **error_out)
238{
239	struct ubd *ubd_dev;
240	struct openflags flags = global_openflags;
241	char *file, *backing_file, *serial;
242	int n, err = 0, i;
243
244	if(index_out) *index_out = -1;
245	n = *str;
246	if(n == '='){
247		str++;
248		if(!strcmp(str, "sync")){
249			global_openflags = of_sync(global_openflags);
250			return err;
251		}
252
253		pr_warn("fake major not supported any more\n");
254		return 0;
255	}
256
257	n = parse_unit(&str);
258	if(n < 0){
259		*error_out = "Couldn't parse device number";
260		return -EINVAL;
261	}
262	if(n >= MAX_DEV){
263		*error_out = "Device number out of range";
264		return 1;
265	}
266
267	err = -EBUSY;
268	mutex_lock(&ubd_lock);
269
270	ubd_dev = &ubd_devs[n];
271	if(ubd_dev->file != NULL){
272		*error_out = "Device is already configured";
273		goto out;
274	}
275
276	if (index_out)
277		*index_out = n;
278
279	err = -EINVAL;
280	for (i = 0; i < sizeof("rscdt="); i++) {
281		switch (*str) {
282		case 'r':
283			flags.w = 0;
284			break;
285		case 's':
286			flags.s = 1;
287			break;
288		case 'd':
289			ubd_dev->no_cow = 1;
290			break;
291		case 'c':
292			ubd_dev->shared = 1;
293			break;
294		case 't':
295			ubd_dev->no_trim = 1;
296			break;
297		case '=':
298			str++;
299			goto break_loop;
300		default:
301			*error_out = "Expected '=' or flag letter "
302				"(r, s, c, t or d)";
303			goto out;
304		}
305		str++;
306	}
307
308	if (*str == '=')
309		*error_out = "Too many flags specified";
310	else
311		*error_out = "Missing '='";
312	goto out;
313
314break_loop:
315	file = strsep(&str, ",:");
316	if (*file == '\0')
317		file = NULL;
318
319	backing_file = strsep(&str, ",:");
320	if (backing_file && *backing_file == '\0')
321		backing_file = NULL;
322
323	serial = strsep(&str, ",:");
324	if (serial && *serial == '\0')
325		serial = NULL;
326
327	if (backing_file && ubd_dev->no_cow) {
328		*error_out = "Can't specify both 'd' and a cow file";
329		goto out;
330	}
331
332	err = 0;
333	ubd_dev->file = file;
334	ubd_dev->cow.file = backing_file;
335	ubd_dev->serial = serial;
336	ubd_dev->boot_openflags = flags;
337out:
338	mutex_unlock(&ubd_lock);
339	return err;
340}
341
342static int ubd_setup(char *str)
343{
344	char *error;
345	int err;
346
347	err = ubd_setup_common(str, NULL, &error);
348	if(err)
349		printk(KERN_ERR "Failed to initialize device with \"%s\" : "
350		       "%s\n", str, error);
351	return 1;
352}
353
354__setup("ubd", ubd_setup);
355__uml_help(ubd_setup,
356"ubd<n><flags>=<filename>[(:|,)<filename2>][(:|,)<serial>]\n"
357"    This is used to associate a device with a file in the underlying\n"
358"    filesystem. When specifying two filenames, the first one is the\n"
359"    COW name and the second is the backing file name. As separator you can\n"
360"    use either a ':' or a ',': the first one allows writing things like;\n"
361"	ubd0=~/Uml/root_cow:~/Uml/root_backing_file\n"
362"    while with a ',' the shell would not expand the 2nd '~'.\n"
363"    When using only one filename, UML will detect whether to treat it like\n"
364"    a COW file or a backing file. To override this detection, add the 'd'\n"
365"    flag:\n"
366"	ubd0d=BackingFile\n"
367"    Usually, there is a filesystem in the file, but \n"
368"    that's not required. Swap devices containing swap files can be\n"
369"    specified like this. Also, a file which doesn't contain a\n"
370"    filesystem can have its contents read in the virtual \n"
371"    machine by running 'dd' on the device. <n> must be in the range\n"
372"    0 to 7. Appending an 'r' to the number will cause that device\n"
373"    to be mounted read-only. For example ubd1r=./ext_fs. Appending\n"
374"    an 's' will cause data to be written to disk on the host immediately.\n"
375"    'c' will cause the device to be treated as being shared between multiple\n"
376"    UMLs and file locking will be turned off - this is appropriate for a\n"
377"    cluster filesystem and inappropriate at almost all other times.\n\n"
378"    't' will disable trim/discard support on the device (enabled by default).\n\n"
379"    An optional device serial number can be exposed using the serial parameter\n"
380"    on the cmdline which is exposed as a sysfs entry. This is particularly\n"
381"    useful when a unique number should be given to the device. Note when\n"
382"    specifying a label, the filename2 must be also presented. It can be\n"
383"    an empty string, in which case the backing file is not used:\n"
384"       ubd0=File,,Serial\n"
385);
386
387static int udb_setup(char *str)
388{
389	printk("udb%s specified on command line is almost certainly a ubd -> "
390	       "udb TYPO\n", str);
391	return 1;
392}
393
394__setup("udb", udb_setup);
395__uml_help(udb_setup,
396"udb\n"
397"    This option is here solely to catch ubd -> udb typos, which can be\n"
398"    to impossible to catch visually unless you specifically look for\n"
399"    them.  The only result of any option starting with 'udb' is an error\n"
400"    in the boot output.\n\n"
401);
402
/*
 * File descriptor that ubd_handler() reads completed requests from.
 * Only changed by ubd_driver_init, which is an initcall.
 */
static int thread_fd = -1;
405
/*
 * Read several request pointers at a time, handling fractional reads if
 * (and as) needed.
 *
 * @fd:             descriptor to read from
 * @request_buffer: array with room for @max_recs request pointers
 * @remainder:      scratch slot (the storage of one pointer) that keeps
 *                  the bytes of an incomplete trailing pointer between
 *                  calls
 * @remainder_size: number of valid bytes in @remainder; updated here
 * @max_recs:       capacity of @request_buffer in pointers
 *
 * Returns the number of bytes of complete pointers now available at the
 * start of @request_buffer (a multiple of the pointer size, possibly 0),
 * or the non-positive result of os_read_file() if nothing was read.
 */
static int bulk_req_safe_read(
	int fd,
	struct io_thread_req * (*request_buffer)[],
	struct io_thread_req **remainder,
	int *remainder_size,
	int max_recs
	)
{
	const int rec_size = sizeof(struct io_thread_req *);
	char *buf = (char *) request_buffer;
	int n = 0;
	int res;
	int partial;

	/* Put back the fragment left over from the previous call. */
	if (*remainder_size > 0) {
		memmove(buf, (char *) remainder, *remainder_size);
		n = *remainder_size;
	}

	res = os_read_file(fd, buf + n, rec_size * max_recs - n);
	if (res <= 0) {
		/* Nothing new; the saved fragment stays in @remainder. */
		return res;
	}

	n += res;
	partial = n % rec_size;
	if (partial > 0) {
		/*
		 * Read somehow returned not a multiple of dword
		 * theoretically possible, but never observed in the
		 * wild, so read routine must be able to handle it
		 */
		WARN(1, "UBD IPC read returned a partial result");
		memmove(remainder, buf + (n - partial), partial);
		n -= partial;
	}

	/*
	 * Always record the new fragment size, including 0.  Without this
	 * a fragment that has just been completed would be injected into
	 * the buffer again on the next call.
	 */
	*remainder_size = partial;

	return n;
}
458
459/* Called without dev->lock held, and only in interrupt context. */
460static void ubd_handler(void)
461{
462	int n;
463	int count;
464
465	while(1){
466		n = bulk_req_safe_read(
467			thread_fd,
468			irq_req_buffer,
469			&irq_remainder,
470			&irq_remainder_size,
471			UBD_REQ_BUFFER_SIZE
472		);
473		if (n < 0) {
474			if(n == -EAGAIN)
475				break;
476			printk(KERN_ERR "spurious interrupt in ubd_handler, "
477			       "err = %d\n", -n);
478			return;
479		}
480		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
481			struct io_thread_req *io_req = (*irq_req_buffer)[count];
482
483			if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) {
484				blk_queue_max_discard_sectors(io_req->req->q, 0);
485				blk_queue_max_write_zeroes_sectors(io_req->req->q, 0);
486			}
487			blk_mq_end_request(io_req->req, io_req->error);
488			kfree(io_req);
489		}
490	}
491}
492
493static irqreturn_t ubd_intr(int irq, void *dev)
494{
495	ubd_handler();
496	return IRQ_HANDLED;
497}
498
/*
 * Pid of the I/O thread, or -1 if there is none.
 * Only changed by ubd_driver_init, which is an initcall.
 */
static int io_pid = -1;

/* Exit hook: kill the I/O thread if one was started. */
static void kill_io_thread(void)
{
	if (io_pid == -1)
		return;

	os_kill_process(io_pid, 1);
}

__uml_exitcall(kill_io_thread);
509
510static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out)
511{
512	char *file;
513	int fd;
514	int err;
515
516	__u32 version;
517	__u32 align;
518	char *backing_file;
519	time64_t mtime;
520	unsigned long long size;
521	int sector_size;
522	int bitmap_offset;
523
524	if (ubd_dev->file && ubd_dev->cow.file) {
525		file = ubd_dev->cow.file;
526
527		goto out;
528	}
529
530	fd = os_open_file(ubd_dev->file, of_read(OPENFLAGS()), 0);
531	if (fd < 0)
532		return fd;
533
534	err = read_cow_header(file_reader, &fd, &version, &backing_file, \
535		&mtime, &size, &sector_size, &align, &bitmap_offset);
536	os_close_file(fd);
537
538	if(err == -EINVAL)
539		file = ubd_dev->file;
540	else
541		file = backing_file;
542
543out:
544	return os_file_size(file, size_out);
545}
546
/*
 * Read @len bytes at @offset of @fd into @buf.
 * Returns 0 unless os_pread_file() reports an error, in which case that
 * (negative) value is returned.
 */
static int read_cow_bitmap(int fd, void *buf, int offset, int len)
{
	int ret = os_pread_file(fd, buf, len, offset);

	return (ret < 0) ? ret : 0;
}
557
558static int backing_file_mismatch(char *file, __u64 size, time64_t mtime)
559{
560	time64_t modtime;
561	unsigned long long actual;
562	int err;
563
564	err = os_file_modtime(file, &modtime);
565	if (err < 0) {
566		printk(KERN_ERR "Failed to get modification time of backing "
567		       "file \"%s\", err = %d\n", file, -err);
568		return err;
569	}
570
571	err = os_file_size(file, &actual);
572	if (err < 0) {
573		printk(KERN_ERR "Failed to get size of backing file \"%s\", "
574		       "err = %d\n", file, -err);
575		return err;
576	}
577
578	if (actual != size) {
579		/*__u64 can be a long on AMD64 and with %lu GCC complains; so
580		 * the typecast.*/
581		printk(KERN_ERR "Size mismatch (%llu vs %llu) of COW header "
582		       "vs backing file\n", (unsigned long long) size, actual);
583		return -EINVAL;
584	}
585	if (modtime != mtime) {
586		printk(KERN_ERR "mtime mismatch (%lld vs %lld) of COW header vs "
587		       "backing file\n", mtime, modtime);
588		return -EINVAL;
589	}
590	return 0;
591}
592
593static int path_requires_switch(char *from_cmdline, char *from_cow, char *cow)
594{
595	struct uml_stat buf1, buf2;
596	int err;
597
598	if (from_cmdline == NULL)
599		return 0;
600	if (!strcmp(from_cmdline, from_cow))
601		return 0;
602
603	err = os_stat_file(from_cmdline, &buf1);
604	if (err < 0) {
605		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cmdline,
606		       -err);
607		return 0;
608	}
609	err = os_stat_file(from_cow, &buf2);
610	if (err < 0) {
611		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cow,
612		       -err);
613		return 1;
614	}
615	if ((buf1.ust_dev == buf2.ust_dev) && (buf1.ust_ino == buf2.ust_ino))
616		return 0;
617
618	printk(KERN_ERR "Backing file mismatch - \"%s\" requested, "
619	       "\"%s\" specified in COW header of \"%s\"\n",
620	       from_cmdline, from_cow, cow);
621	return 1;
622}
623
624static int open_ubd_file(char *file, struct openflags *openflags, int shared,
625		  char **backing_file_out, int *bitmap_offset_out,
626		  unsigned long *bitmap_len_out, int *data_offset_out,
627		  int *create_cow_out)
628{
629	time64_t mtime;
630	unsigned long long size;
631	__u32 version, align;
632	char *backing_file;
633	int fd, err, sectorsize, asked_switch, mode = 0644;
634
635	fd = os_open_file(file, *openflags, mode);
636	if (fd < 0) {
637		if ((fd == -ENOENT) && (create_cow_out != NULL))
638			*create_cow_out = 1;
639		if (!openflags->w ||
640		    ((fd != -EROFS) && (fd != -EACCES)))
641			return fd;
642		openflags->w = 0;
643		fd = os_open_file(file, *openflags, mode);
644		if (fd < 0)
645			return fd;
646	}
647
648	if (shared)
649		printk(KERN_INFO "Not locking \"%s\" on the host\n", file);
650	else {
651		err = os_lock_file(fd, openflags->w);
652		if (err < 0) {
653			printk(KERN_ERR "Failed to lock '%s', err = %d\n",
654			       file, -err);
655			goto out_close;
656		}
657	}
658
659	/* Successful return case! */
660	if (backing_file_out == NULL)
661		return fd;
662
663	err = read_cow_header(file_reader, &fd, &version, &backing_file, &mtime,
664			      &size, &sectorsize, &align, bitmap_offset_out);
665	if (err && (*backing_file_out != NULL)) {
666		printk(KERN_ERR "Failed to read COW header from COW file "
667		       "\"%s\", errno = %d\n", file, -err);
668		goto out_close;
669	}
670	if (err)
671		return fd;
672
673	asked_switch = path_requires_switch(*backing_file_out, backing_file,
674					    file);
675
676	/* Allow switching only if no mismatch. */
677	if (asked_switch && !backing_file_mismatch(*backing_file_out, size,
678						   mtime)) {
679		printk(KERN_ERR "Switching backing file to '%s'\n",
680		       *backing_file_out);
681		err = write_cow_header(file, fd, *backing_file_out,
682				       sectorsize, align, &size);
683		if (err) {
684			printk(KERN_ERR "Switch failed, errno = %d\n", -err);
685			goto out_close;
686		}
687	} else {
688		*backing_file_out = backing_file;
689		err = backing_file_mismatch(*backing_file_out, size, mtime);
690		if (err)
691			goto out_close;
692	}
693
694	cow_sizes(version, size, sectorsize, align, *bitmap_offset_out,
695		  bitmap_len_out, data_offset_out);
696
697	return fd;
698 out_close:
699	os_close_file(fd);
700	return err;
701}
702
703static int create_cow_file(char *cow_file, char *backing_file,
704		    struct openflags flags,
705		    int sectorsize, int alignment, int *bitmap_offset_out,
706		    unsigned long *bitmap_len_out, int *data_offset_out)
707{
708	int err, fd;
709
710	flags.c = 1;
711	fd = open_ubd_file(cow_file, &flags, 0, NULL, NULL, NULL, NULL, NULL);
712	if (fd < 0) {
713		err = fd;
714		printk(KERN_ERR "Open of COW file '%s' failed, errno = %d\n",
715		       cow_file, -err);
716		goto out;
717	}
718
719	err = init_cow_file(fd, cow_file, backing_file, sectorsize, alignment,
720			    bitmap_offset_out, bitmap_len_out,
721			    data_offset_out);
722	if (!err)
723		return fd;
724	os_close_file(fd);
725 out:
726	return err;
727}
728
729static void ubd_close_dev(struct ubd *ubd_dev)
730{
731	os_close_file(ubd_dev->fd);
732	if(ubd_dev->cow.file == NULL)
733		return;
734
735	os_close_file(ubd_dev->cow.fd);
736	vfree(ubd_dev->cow.bitmap);
737	ubd_dev->cow.bitmap = NULL;
738}
739
740static int ubd_open_dev(struct ubd *ubd_dev)
741{
742	struct openflags flags;
743	char **back_ptr;
744	int err, create_cow, *create_ptr;
745	int fd;
746
747	ubd_dev->openflags = ubd_dev->boot_openflags;
748	create_cow = 0;
749	create_ptr = (ubd_dev->cow.file != NULL) ? &create_cow : NULL;
750	back_ptr = ubd_dev->no_cow ? NULL : &ubd_dev->cow.file;
751
752	fd = open_ubd_file(ubd_dev->file, &ubd_dev->openflags, ubd_dev->shared,
753				back_ptr, &ubd_dev->cow.bitmap_offset,
754				&ubd_dev->cow.bitmap_len, &ubd_dev->cow.data_offset,
755				create_ptr);
756
757	if((fd == -ENOENT) && create_cow){
758		fd = create_cow_file(ubd_dev->file, ubd_dev->cow.file,
759					  ubd_dev->openflags, SECTOR_SIZE, PAGE_SIZE,
760					  &ubd_dev->cow.bitmap_offset,
761					  &ubd_dev->cow.bitmap_len,
762					  &ubd_dev->cow.data_offset);
763		if(fd >= 0){
764			printk(KERN_INFO "Creating \"%s\" as COW file for "
765			       "\"%s\"\n", ubd_dev->file, ubd_dev->cow.file);
766		}
767	}
768
769	if(fd < 0){
770		printk("Failed to open '%s', errno = %d\n", ubd_dev->file,
771		       -fd);
772		return fd;
773	}
774	ubd_dev->fd = fd;
775
776	if(ubd_dev->cow.file != NULL){
777		blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
778
779		err = -ENOMEM;
780		ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
781		if(ubd_dev->cow.bitmap == NULL){
782			printk(KERN_ERR "Failed to vmalloc COW bitmap\n");
783			goto error;
784		}
785		flush_tlb_kernel_vm();
786
787		err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap,
788				      ubd_dev->cow.bitmap_offset,
789				      ubd_dev->cow.bitmap_len);
790		if(err < 0)
791			goto error;
792
793		flags = ubd_dev->openflags;
794		flags.w = 0;
795		err = open_ubd_file(ubd_dev->cow.file, &flags, ubd_dev->shared, NULL,
796				    NULL, NULL, NULL, NULL);
797		if(err < 0) goto error;
798		ubd_dev->cow.fd = err;
799	}
800	if (ubd_dev->no_trim == 0) {
801		ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE;
802		blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
803		blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
804	}
805	blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
806	return 0;
807 error:
808	os_close_file(ubd_dev->fd);
809	return err;
810}
811
812static void ubd_device_release(struct device *dev)
813{
814	struct ubd *ubd_dev = dev_get_drvdata(dev);
815
816	blk_mq_free_tag_set(&ubd_dev->tag_set);
817	*ubd_dev = ((struct ubd) DEFAULT_UBD);
818}
819
820static ssize_t serial_show(struct device *dev,
821			   struct device_attribute *attr, char *buf)
822{
823	struct gendisk *disk = dev_to_disk(dev);
824	struct ubd *ubd_dev = disk->private_data;
825
826	if (!ubd_dev)
827		return 0;
828
829	return sprintf(buf, "%s", ubd_dev->serial);
830}
831
832static DEVICE_ATTR_RO(serial);
833
834static struct attribute *ubd_attrs[] = {
835	&dev_attr_serial.attr,
836	NULL,
837};
838
839static umode_t ubd_attrs_are_visible(struct kobject *kobj,
840				     struct attribute *a, int n)
841{
842	return a->mode;
843}
844
845static const struct attribute_group ubd_attr_group = {
846	.attrs = ubd_attrs,
847	.is_visible = ubd_attrs_are_visible,
848};
849
850static const struct attribute_group *ubd_attr_groups[] = {
851	&ubd_attr_group,
852	NULL,
853};
854
855static int ubd_disk_register(int major, u64 size, int unit,
856			     struct gendisk *disk)
857{
858	disk->major = major;
859	disk->first_minor = unit << UBD_SHIFT;
860	disk->minors = 1 << UBD_SHIFT;
861	disk->fops = &ubd_blops;
862	set_capacity(disk, size / 512);
863	sprintf(disk->disk_name, "ubd%c", 'a' + unit);
864
865	ubd_devs[unit].pdev.id   = unit;
866	ubd_devs[unit].pdev.name = DRIVER_NAME;
867	ubd_devs[unit].pdev.dev.release = ubd_device_release;
868	dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
869	platform_device_register(&ubd_devs[unit].pdev);
870
871	disk->private_data = &ubd_devs[unit];
872	disk->queue = ubd_devs[unit].queue;
873	return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
874}
875
/*
 * Round n up to the next multiple of SECTOR_SIZE.  The argument is
 * parenthesised so that expressions can be passed safely.
 */
#define ROUND_BLOCK(n) (((n) + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
877
878static const struct blk_mq_ops ubd_mq_ops = {
879	.queue_rq = ubd_queue_rq,
880};
881
882static int ubd_add(int n, char **error_out)
883{
884	struct ubd *ubd_dev = &ubd_devs[n];
885	struct gendisk *disk;
886	int err = 0;
887
888	if(ubd_dev->file == NULL)
889		goto out;
890
891	err = ubd_file_size(ubd_dev, &ubd_dev->size);
892	if(err < 0){
893		*error_out = "Couldn't determine size of device's file";
894		goto out;
895	}
896
897	ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
898
899	ubd_dev->tag_set.ops = &ubd_mq_ops;
900	ubd_dev->tag_set.queue_depth = 64;
901	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
902	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
903	ubd_dev->tag_set.driver_data = ubd_dev;
904	ubd_dev->tag_set.nr_hw_queues = 1;
905
906	err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
907	if (err)
908		goto out;
909
910	disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev);
911	if (IS_ERR(disk)) {
912		err = PTR_ERR(disk);
913		goto out_cleanup_tags;
914	}
915	ubd_dev->queue = disk->queue;
916
917	blk_queue_write_cache(ubd_dev->queue, true, false);
918	blk_queue_max_segments(ubd_dev->queue, MAX_SG);
919	blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
920	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
921	if (err)
922		goto out_cleanup_disk;
923
924	ubd_gendisk[n] = disk;
925	return 0;
926
927out_cleanup_disk:
928	put_disk(disk);
929out_cleanup_tags:
930	blk_mq_free_tag_set(&ubd_dev->tag_set);
931out:
932	return err;
933}
934
935static int ubd_config(char *str, char **error_out)
936{
937	int n, ret;
938
939	/* This string is possibly broken up and stored, so it's only
940	 * freed if ubd_setup_common fails, or if only general options
941	 * were set.
942	 */
943	str = kstrdup(str, GFP_KERNEL);
944	if (str == NULL) {
945		*error_out = "Failed to allocate memory";
946		return -ENOMEM;
947	}
948
949	ret = ubd_setup_common(str, &n, error_out);
950	if (ret)
951		goto err_free;
952
953	if (n == -1) {
954		ret = 0;
955		goto err_free;
956	}
957
958	mutex_lock(&ubd_lock);
959	ret = ubd_add(n, error_out);
960	if (ret)
961		ubd_devs[n].file = NULL;
962	mutex_unlock(&ubd_lock);
963
964out:
965	return ret;
966
967err_free:
968	kfree(str);
969	goto out;
970}
971
972static int ubd_get_config(char *name, char *str, int size, char **error_out)
973{
974	struct ubd *ubd_dev;
975	int n, len = 0;
976
977	n = parse_unit(&name);
978	if((n >= MAX_DEV) || (n < 0)){
979		*error_out = "ubd_get_config : device number out of range";
980		return -1;
981	}
982
983	ubd_dev = &ubd_devs[n];
984	mutex_lock(&ubd_lock);
985
986	if(ubd_dev->file == NULL){
987		CONFIG_CHUNK(str, size, len, "", 1);
988		goto out;
989	}
990
991	CONFIG_CHUNK(str, size, len, ubd_dev->file, 0);
992
993	if(ubd_dev->cow.file != NULL){
994		CONFIG_CHUNK(str, size, len, ",", 0);
995		CONFIG_CHUNK(str, size, len, ubd_dev->cow.file, 1);
996	}
997	else CONFIG_CHUNK(str, size, len, "", 1);
998
999 out:
1000	mutex_unlock(&ubd_lock);
1001	return len;
1002}
1003
1004static int ubd_id(char **str, int *start_out, int *end_out)
1005{
1006	int n;
1007
1008	n = parse_unit(str);
1009	*start_out = 0;
1010	*end_out = MAX_DEV - 1;
1011	return n;
1012}
1013
1014static int ubd_remove(int n, char **error_out)
1015{
1016	struct gendisk *disk = ubd_gendisk[n];
1017	struct ubd *ubd_dev;
1018	int err = -ENODEV;
1019
1020	mutex_lock(&ubd_lock);
1021
1022	ubd_dev = &ubd_devs[n];
1023
1024	if(ubd_dev->file == NULL)
1025		goto out;
1026
1027	/* you cannot remove a open disk */
1028	err = -EBUSY;
1029	if(ubd_dev->count > 0)
1030		goto out;
1031
1032	ubd_gendisk[n] = NULL;
1033	if(disk != NULL){
1034		del_gendisk(disk);
1035		put_disk(disk);
1036	}
1037
1038	err = 0;
1039	platform_device_unregister(&ubd_dev->pdev);
1040out:
1041	mutex_unlock(&ubd_lock);
1042	return err;
1043}
1044
1045/* All these are called by mconsole in process context and without
1046 * ubd-specific locks.  The structure itself is const except for .list.
1047 */
1048static struct mc_device ubd_mc = {
1049	.list		= LIST_HEAD_INIT(ubd_mc.list),
1050	.name		= "ubd",
1051	.config		= ubd_config,
1052	.get_config	= ubd_get_config,
1053	.id		= ubd_id,
1054	.remove		= ubd_remove,
1055};
1056
1057static int __init ubd_mc_init(void)
1058{
1059	mconsole_register_dev(&ubd_mc);
1060	return 0;
1061}
1062
1063__initcall(ubd_mc_init);
1064
1065static int __init ubd0_init(void)
1066{
1067	struct ubd *ubd_dev = &ubd_devs[0];
1068
1069	mutex_lock(&ubd_lock);
1070	if(ubd_dev->file == NULL)
1071		ubd_dev->file = "root_fs";
1072	mutex_unlock(&ubd_lock);
1073
1074	return 0;
1075}
1076
1077__initcall(ubd0_init);
1078
1079/* Used in ubd_init, which is an initcall */
1080static struct platform_driver ubd_driver = {
1081	.driver = {
1082		.name  = DRIVER_NAME,
1083	},
1084};
1085
1086static int __init ubd_init(void)
1087{
1088	char *error;
1089	int i, err;
1090
1091	if (register_blkdev(UBD_MAJOR, "ubd"))
1092		return -1;
1093
1094	irq_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
1095				       sizeof(struct io_thread_req *),
1096				       GFP_KERNEL
1097		);
1098	irq_remainder = 0;
1099
1100	if (irq_req_buffer == NULL) {
1101		printk(KERN_ERR "Failed to initialize ubd buffering\n");
1102		return -1;
1103	}
1104	io_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
1105				      sizeof(struct io_thread_req *),
1106				      GFP_KERNEL
1107		);
1108
1109	io_remainder = 0;
1110
1111	if (io_req_buffer == NULL) {
1112		printk(KERN_ERR "Failed to initialize ubd buffering\n");
1113		return -1;
1114	}
1115	platform_driver_register(&ubd_driver);
1116	mutex_lock(&ubd_lock);
1117	for (i = 0; i < MAX_DEV; i++){
1118		err = ubd_add(i, &error);
1119		if(err)
1120			printk(KERN_ERR "Failed to initialize ubd device %d :"
1121			       "%s\n", i, error);
1122	}
1123	mutex_unlock(&ubd_lock);
1124	return 0;
1125}
1126
1127late_initcall(ubd_init);
1128
1129static int __init ubd_driver_init(void){
1130	unsigned long stack;
1131	int err;
1132
1133	/* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/
1134	if(global_openflags.s){
1135		printk(KERN_INFO "ubd: Synchronous mode\n");
1136		/* Letting ubd=sync be like using ubd#s= instead of ubd#= is
1137		 * enough. So use anyway the io thread. */
1138	}
1139	stack = alloc_stack(0, 0);
1140	io_pid = start_io_thread(stack + PAGE_SIZE, &thread_fd);
1141	if(io_pid < 0){
1142		printk(KERN_ERR
1143		       "ubd : Failed to start I/O thread (errno = %d) - "
1144		       "falling back to synchronous I/O\n", -io_pid);
1145		io_pid = -1;
1146		return 0;
1147	}
1148	err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
1149			     0, "ubd", ubd_devs);
1150	if(err < 0)
1151		printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err);
1152	return 0;
1153}
1154
1155device_initcall(ubd_driver_init);
1156
1157static int ubd_open(struct gendisk *disk, blk_mode_t mode)
1158{
1159	struct ubd *ubd_dev = disk->private_data;
1160	int err = 0;
1161
1162	mutex_lock(&ubd_mutex);
1163	if(ubd_dev->count == 0){
1164		err = ubd_open_dev(ubd_dev);
1165		if(err){
1166			printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
1167			       disk->disk_name, ubd_dev->file, -err);
1168			goto out;
1169		}
1170	}
1171	ubd_dev->count++;
1172	set_disk_ro(disk, !ubd_dev->openflags.w);
1173out:
1174	mutex_unlock(&ubd_mutex);
1175	return err;
1176}
1177
1178static void ubd_release(struct gendisk *disk)
1179{
1180	struct ubd *ubd_dev = disk->private_data;
1181
1182	mutex_lock(&ubd_mutex);
1183	if(--ubd_dev->count == 0)
1184		ubd_close_dev(ubd_dev);
1185	mutex_unlock(&ubd_mutex);
1186}
1187
1188static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
1189			  __u64 *cow_offset, unsigned long *bitmap,
1190			  __u64 bitmap_offset, unsigned long *bitmap_words,
1191			  __u64 bitmap_len)
1192{
1193	__u64 sector = io_offset >> SECTOR_SHIFT;
1194	int i, update_bitmap = 0;
1195
1196	for (i = 0; i < length >> SECTOR_SHIFT; i++) {
1197		if(cow_mask != NULL)
1198			ubd_set_bit(i, (unsigned char *) cow_mask);
1199		if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
1200			continue;
1201
1202		update_bitmap = 1;
1203		ubd_set_bit(sector + i, (unsigned char *) bitmap);
1204	}
1205
1206	if(!update_bitmap)
1207		return;
1208
1209	*cow_offset = sector / (sizeof(unsigned long) * 8);
1210
1211	/* This takes care of the case where we're exactly at the end of the
1212	 * device, and *cow_offset + 1 is off the end.  So, just back it up
1213	 * by one word.  Thanks to Lynn Kerby for the fix and James McMechan
1214	 * for the original diagnosis.
1215	 */
1216	if (*cow_offset == (DIV_ROUND_UP(bitmap_len,
1217					 sizeof(unsigned long)) - 1))
1218		(*cow_offset)--;
1219
1220	bitmap_words[0] = bitmap[*cow_offset];
1221	bitmap_words[1] = bitmap[*cow_offset + 1];
1222
1223	*cow_offset *= sizeof(unsigned long);
1224	*cow_offset += bitmap_offset;
1225}
1226
1227static void cowify_req(struct io_thread_req *req, struct io_desc *segment,
1228		       unsigned long offset, unsigned long *bitmap,
1229		       __u64 bitmap_offset, __u64 bitmap_len)
1230{
1231	__u64 sector = offset >> SECTOR_SHIFT;
1232	int i;
1233
1234	if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT)
1235		panic("Operation too long");
1236
1237	if (req_op(req->req) == REQ_OP_READ) {
1238		for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) {
1239			if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
1240				ubd_set_bit(i, (unsigned char *)
1241					    &segment->sector_mask);
1242		}
1243	} else {
1244		cowify_bitmap(offset, segment->length, &segment->sector_mask,
1245			      &segment->cow_offset, bitmap, bitmap_offset,
1246			      segment->bitmap_words, bitmap_len);
1247	}
1248}
1249
1250static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req,
1251			struct request *req)
1252{
1253	struct bio_vec bvec;
1254	struct req_iterator iter;
1255	int i = 0;
1256	unsigned long byte_offset = io_req->offset;
1257	enum req_op op = req_op(req);
1258
1259	if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) {
1260		io_req->io_desc[0].buffer = NULL;
1261		io_req->io_desc[0].length = blk_rq_bytes(req);
1262	} else {
1263		rq_for_each_segment(bvec, req, iter) {
1264			BUG_ON(i >= io_req->desc_cnt);
1265
1266			io_req->io_desc[i].buffer = bvec_virt(&bvec);
1267			io_req->io_desc[i].length = bvec.bv_len;
1268			i++;
1269		}
1270	}
1271
1272	if (dev->cow.file) {
1273		for (i = 0; i < io_req->desc_cnt; i++) {
1274			cowify_req(io_req, &io_req->io_desc[i], byte_offset,
1275				   dev->cow.bitmap, dev->cow.bitmap_offset,
1276				   dev->cow.bitmap_len);
1277			byte_offset += io_req->io_desc[i].length;
1278		}
1279
1280	}
1281}
1282
1283static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req,
1284					   int desc_cnt)
1285{
1286	struct io_thread_req *io_req;
1287	int i;
1288
1289	io_req = kmalloc(sizeof(*io_req) +
1290			 (desc_cnt * sizeof(struct io_desc)),
1291			 GFP_ATOMIC);
1292	if (!io_req)
1293		return NULL;
1294
1295	io_req->req = req;
1296	if (dev->cow.file)
1297		io_req->fds[0] = dev->cow.fd;
1298	else
1299		io_req->fds[0] = dev->fd;
1300	io_req->error = 0;
1301	io_req->sectorsize = SECTOR_SIZE;
1302	io_req->fds[1] = dev->fd;
1303	io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT;
1304	io_req->offsets[0] = 0;
1305	io_req->offsets[1] = dev->cow.data_offset;
1306
1307	for (i = 0 ; i < desc_cnt; i++) {
1308		io_req->io_desc[i].sector_mask = 0;
1309		io_req->io_desc[i].cow_offset = -1;
1310	}
1311
1312	return io_req;
1313}
1314
1315static int ubd_submit_request(struct ubd *dev, struct request *req)
1316{
1317	int segs = 0;
1318	struct io_thread_req *io_req;
1319	int ret;
1320	enum req_op op = req_op(req);
1321
1322	if (op == REQ_OP_FLUSH)
1323		segs = 0;
1324	else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD)
1325		segs = 1;
1326	else
1327		segs = blk_rq_nr_phys_segments(req);
1328
1329	io_req = ubd_alloc_req(dev, req, segs);
1330	if (!io_req)
1331		return -ENOMEM;
1332
1333	io_req->desc_cnt = segs;
1334	if (segs)
1335		ubd_map_req(dev, io_req, req);
1336
1337	ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
1338	if (ret != sizeof(io_req)) {
1339		if (ret != -EAGAIN)
1340			pr_err("write to io thread failed: %d\n", -ret);
1341		kfree(io_req);
1342	}
1343	return ret;
1344}
1345
1346static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
1347				 const struct blk_mq_queue_data *bd)
1348{
1349	struct ubd *ubd_dev = hctx->queue->queuedata;
1350	struct request *req = bd->rq;
1351	int ret = 0, res = BLK_STS_OK;
1352
1353	blk_mq_start_request(req);
1354
1355	spin_lock_irq(&ubd_dev->lock);
1356
1357	switch (req_op(req)) {
1358	case REQ_OP_FLUSH:
1359	case REQ_OP_READ:
1360	case REQ_OP_WRITE:
1361	case REQ_OP_DISCARD:
1362	case REQ_OP_WRITE_ZEROES:
1363		ret = ubd_submit_request(ubd_dev, req);
1364		break;
1365	default:
1366		WARN_ON_ONCE(1);
1367		res = BLK_STS_NOTSUPP;
1368	}
1369
1370	spin_unlock_irq(&ubd_dev->lock);
1371
1372	if (ret < 0) {
1373		if (ret == -ENOMEM)
1374			res = BLK_STS_RESOURCE;
1375		else
1376			res = BLK_STS_DEV_RESOURCE;
1377	}
1378
1379	return res;
1380}
1381
1382static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1383{
1384	struct ubd *ubd_dev = bdev->bd_disk->private_data;
1385
1386	geo->heads = 128;
1387	geo->sectors = 32;
1388	geo->cylinders = ubd_dev->size / (128 * 32 * 512);
1389	return 0;
1390}
1391
1392static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
1393		     unsigned int cmd, unsigned long arg)
1394{
1395	struct ubd *ubd_dev = bdev->bd_disk->private_data;
1396	u16 ubd_id[ATA_ID_WORDS];
1397
1398	switch (cmd) {
1399		struct cdrom_volctrl volume;
1400	case HDIO_GET_IDENTITY:
1401		memset(&ubd_id, 0, ATA_ID_WORDS * 2);
1402		ubd_id[ATA_ID_CYLS]	= ubd_dev->size / (128 * 32 * 512);
1403		ubd_id[ATA_ID_HEADS]	= 128;
1404		ubd_id[ATA_ID_SECTORS]	= 32;
1405		if(copy_to_user((char __user *) arg, (char *) &ubd_id,
1406				 sizeof(ubd_id)))
1407			return -EFAULT;
1408		return 0;
1409
1410	case CDROMVOLREAD:
1411		if(copy_from_user(&volume, (char __user *) arg, sizeof(volume)))
1412			return -EFAULT;
1413		volume.channel0 = 255;
1414		volume.channel1 = 255;
1415		volume.channel2 = 255;
1416		volume.channel3 = 255;
1417		if(copy_to_user((char __user *) arg, &volume, sizeof(volume)))
1418			return -EFAULT;
1419		return 0;
1420	}
1421	return -EINVAL;
1422}
1423
1424static int map_error(int error_code)
1425{
1426	switch (error_code) {
1427	case 0:
1428		return BLK_STS_OK;
1429	case ENOSYS:
1430	case EOPNOTSUPP:
1431		return BLK_STS_NOTSUPP;
1432	case ENOSPC:
1433		return BLK_STS_NOSPC;
1434	}
1435	return BLK_STS_IOERR;
1436}
1437
1438/*
1439 * Everything from here onwards *IS NOT PART OF THE KERNEL*
1440 *
1441 * The following functions are part of UML hypervisor code.
1442 * All functions from here onwards are executed as a helper
1443 * thread and are not allowed to execute any kernel functions.
1444 *
1445 * Any communication must occur strictly via shared memory and IPC.
1446 *
1447 * Do not add printks, locks, kernel memory operations, etc - it
1448 * will result in unpredictable behaviour and/or crashes.
1449 */
1450
1451static int update_bitmap(struct io_thread_req *req, struct io_desc *segment)
1452{
1453	int n;
1454
1455	if (segment->cow_offset == -1)
1456		return map_error(0);
1457
1458	n = os_pwrite_file(req->fds[1], &segment->bitmap_words,
1459			  sizeof(segment->bitmap_words), segment->cow_offset);
1460	if (n != sizeof(segment->bitmap_words))
1461		return map_error(-n);
1462
1463	return map_error(0);
1464}
1465
1466static void do_io(struct io_thread_req *req, struct io_desc *desc)
1467{
1468	char *buf = NULL;
1469	unsigned long len;
1470	int n, nsectors, start, end, bit;
1471	__u64 off;
1472
1473	/* FLUSH is really a special case, we cannot "case" it with others */
1474
1475	if (req_op(req->req) == REQ_OP_FLUSH) {
1476		/* fds[0] is always either the rw image or our cow file */
1477		req->error = map_error(-os_sync_file(req->fds[0]));
1478		return;
1479	}
1480
1481	nsectors = desc->length / req->sectorsize;
1482	start = 0;
1483	do {
1484		bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask);
1485		end = start;
1486		while((end < nsectors) &&
1487		      (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit))
1488			end++;
1489
1490		off = req->offset + req->offsets[bit] +
1491			start * req->sectorsize;
1492		len = (end - start) * req->sectorsize;
1493		if (desc->buffer != NULL)
1494			buf = &desc->buffer[start * req->sectorsize];
1495
1496		switch (req_op(req->req)) {
1497		case REQ_OP_READ:
1498			n = 0;
1499			do {
1500				buf = &buf[n];
1501				len -= n;
1502				n = os_pread_file(req->fds[bit], buf, len, off);
1503				if (n < 0) {
1504					req->error = map_error(-n);
1505					return;
1506				}
1507			} while((n < len) && (n != 0));
1508			if (n < len) memset(&buf[n], 0, len - n);
1509			break;
1510		case REQ_OP_WRITE:
1511			n = os_pwrite_file(req->fds[bit], buf, len, off);
1512			if(n != len){
1513				req->error = map_error(-n);
1514				return;
1515			}
1516			break;
1517		case REQ_OP_DISCARD:
1518			n = os_falloc_punch(req->fds[bit], off, len);
1519			if (n) {
1520				req->error = map_error(-n);
1521				return;
1522			}
1523			break;
1524		case REQ_OP_WRITE_ZEROES:
1525			n = os_falloc_zeroes(req->fds[bit], off, len);
1526			if (n) {
1527				req->error = map_error(-n);
1528				return;
1529			}
1530			break;
1531		default:
1532			WARN_ON_ONCE(1);
1533			req->error = BLK_STS_NOTSUPP;
1534			return;
1535		}
1536
1537		start = end;
1538	} while(start < nsectors);
1539
1540	req->offset += len;
1541	req->error = update_bitmap(req, desc);
1542}
1543
/*
 * File descriptor the I/O thread reads requests from and writes completions
 * to.  Changed in start_io_thread(), which is serialized by being called only
 * from ubd_driver_init(), an initcall.
 */
int kernel_fd = -1;

/* Incremented by the I/O thread for every request; otherwise unused. */
static int io_count;
1551
1552int io_thread(void *arg)
1553{
1554	int n, count, written, res;
1555
1556	os_fix_helper_signals();
1557
1558	while(1){
1559		n = bulk_req_safe_read(
1560			kernel_fd,
1561			io_req_buffer,
1562			&io_remainder,
1563			&io_remainder_size,
1564			UBD_REQ_BUFFER_SIZE
1565		);
1566		if (n <= 0) {
1567			if (n == -EAGAIN)
1568				ubd_read_poll(-1);
1569
1570			continue;
1571		}
1572
1573		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
1574			struct io_thread_req *req = (*io_req_buffer)[count];
1575			int i;
1576
1577			io_count++;
1578			for (i = 0; !req->error && i < req->desc_cnt; i++)
1579				do_io(req, &(req->io_desc[i]));
1580
1581		}
1582
1583		written = 0;
1584
1585		do {
1586			res = os_write_file(kernel_fd,
1587					    ((char *) io_req_buffer) + written,
1588					    n - written);
1589			if (res >= 0) {
1590				written += res;
1591			}
1592			if (written < n) {
1593				ubd_write_poll(-1);
1594			}
1595		} while (written < n);
1596	}
1597
1598	return 0;
1599}
1600