// SPDX-License-Identifier: GPL-2.0-only
//#define DEBUG
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-virtio.h>
#include <linux/numa.h>
#include <uapi/linux/virtio_ring.h>

#define PART_BITS 4
#define VQ_NAME_LEN 16
#define MAX_DISCARD_SEGMENTS 256u

static int major;
static DEFINE_IDA(vd_index_ida);

static struct workqueue_struct *virtblk_wq;

struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;
	char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;

struct virtio_blk {
	/*
	 * This mutex must be held by anything that may run after
	 * virtblk_remove() sets vblk->vdev to NULL.
	 *
	 * blk-mq, virtqueue processing, and sysfs attribute code paths are
	 * shut down before vblk->vdev is set to NULL and therefore do not need
	 * to hold this mutex.
	 */
	struct mutex vdev_mutex;
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/*
	 * Tracks references from block_device_operations open/release and
	 * virtio_driver probe/remove so this object can be freed once no
	 * longer in use.
	 */
	refcount_t refs;

	/* What the host tells us, plus 2 for the request header & status trailer. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* num of vqs */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

struct virtblk_req {
	struct virtio_blk_outhdr out_hdr;
	u8 status;
	struct scatterlist sg[];
};

static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
		return BLK_STS_OK;
	case VIRTIO_BLK_S_UNSUPP:
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}
}

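/*
 * Build the scatterlist chain for a request (out_hdr, optional data, status
 * byte) and add it to the virtqueue.  The caller holds the virtqueue's lock.
 */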
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;

	if (have_data) {
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

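/*
 * Set up the payload for a discard or write-zeroes request: one
 * virtio_blk_discard_write_zeroes range per discard segment, attached to the
 * request as a special payload that is freed in virtblk_request_done().
 */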
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	/*
	 * A single max discard segment means multi-range discard isn't
	 * supported, and the block layer only merges contiguous ranges,
	 * as it does for normal read/write requests.  So we can't rely on
	 * the bios to retrieve the per-range information.
	 */
	if (queue_max_discard_segments(req->q) == 1) {
		range[0].flags = cpu_to_le32(flags);
		range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
		range[0].sector = cpu_to_le64(blk_rq_pos(req));
		n = 1;
	} else {
		__rq_for_each_bio(bio, req) {
			u64 sector = bio->bi_iter.bi_sector;
			u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

			range[n].flags = cpu_to_le32(flags);
			range[n].num_sectors = cpu_to_le32(num_sectors);
			range[n].sector = cpu_to_le64(sector);
			n++;
		}
	}

	WARN_ON_ONCE(n != segments);

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

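/* blk-mq ->complete hook: free any discard/write-zeroes payload and end the request. */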
static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

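/*
 * Virtqueue callback: reap completed requests and restart any stopped
 * hardware queues once at least one request has finished.
 */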
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			if (likely(!blk_should_fake_timeout(req->q)))
				blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

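/*
 * blk-mq ->commit_rqs hook: kick the virtqueue for requests that were queued
 * without a final bd->last notification.
 */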
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
	bool kick;

	spin_lock_irq(&vq->lock);
	kick = virtqueue_kick_prepare(vq->vq);
	spin_unlock_irq(&vq->lock);

	if (kick)
		virtqueue_notify(vq->vq);
}

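/*
 * blk-mq ->queue_rq hook: translate the request into a virtio-blk command,
 * map its data, and add it to this hardware queue's virtqueue.
 */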
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
			   const struct blk_mq_queue_data *bd)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct request *req = bd->rq;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
	unsigned long flags;
	unsigned int num;
	int qid = hctx->queue_num;
	int err;
	bool notify = false;
	bool unmap = false;
	u32 type;

	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);

	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

	blk_mq_start_request(req);

	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
	if (num) {
		if (rq_data_dir(req) == WRITE)
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
		else
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
	}

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
	if (err) {
		virtqueue_kick(vblk->vqs[qid].vq);
		/* Don't stop the queue if -ENOMEM: we may have failed to
		 * bounce the buffer due to global resource outage.
		 */
		if (err == -ENOSPC)
			blk_mq_stop_hw_queue(hctx);
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
		switch (err) {
		case -ENOSPC:
			return BLK_STS_DEV_RESOURCE;
		case -ENOMEM:
			return BLK_STS_RESOURCE;
		default:
			return BLK_STS_IOERR;
		}
	}

	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
		notify = true;
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);

	if (notify)
		virtqueue_notify(vblk->vqs[qid].vq);
	return BLK_STS_OK;
}

/* Return the ID (serial number) string for *disk in *id_str. */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

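/*
 * Reference counting for struct virtio_blk: references are held across
 * probe/remove and open/release, and the last put frees the structure.
 */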
static void virtblk_get(struct virtio_blk *vblk)
{
	refcount_inc(&vblk->refs);
}

static void virtblk_put(struct virtio_blk *vblk)
{
	if (refcount_dec_and_test(&vblk->refs)) {
		ida_simple_remove(&vd_index_ida, vblk->index);
		mutex_destroy(&vblk->vdev_mutex);
		kfree(vblk);
	}
}

static int virtblk_open(struct block_device *bd, fmode_t mode)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;
	int ret = 0;

	mutex_lock(&vblk->vdev_mutex);

	if (vblk->vdev)
		virtblk_get(vblk);
	else
		ret = -ENXIO;

	mutex_unlock(&vblk->vdev_mutex);
	return ret;
}

static void virtblk_release(struct gendisk *disk, fmode_t mode)
{
	struct virtio_blk *vblk = disk->private_data;

	virtblk_put(vblk);
}

/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;
	int ret = 0;

	mutex_lock(&vblk->vdev_mutex);

	if (!vblk->vdev) {
		ret = -ENXIO;
		goto out;
	}

	/* see if the host passed in geometry config */
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
out:
	mutex_unlock(&vblk->vdev_mutex);
	return ret;
}

static const struct block_device_operations virtblk_fops = {
	.owner  = THIS_MODULE,
	.open = virtblk_open,
	.release = virtblk_release,
	.getgeo = virtblk_getgeo,
};

static int index_to_minor(int index)
{
	return index << PART_BITS;
}

static int minor_to_index(int minor)
{
	return minor >> PART_BITS;
}

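/* sysfs "serial" attribute: report the device ID string (VIRTIO_BLK_T_GET_ID). */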
static ssize_t serial_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}

static DEVICE_ATTR_RO(serial);

/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity_revalidate_and_notify(vblk->disk, capacity, true);
}

static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);

	virtblk_update_capacity(vblk, true);
}

static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

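/*
 * Read the number of request queues offered via VIRTIO_BLK_F_MQ (default 1),
 * cap it at nr_cpu_ids, and set up one virtio_blk_vq per virtqueue.
 */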
static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;

	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration.  */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	char *begin = buf + strlen(prefix);
	char *end = buf + buflen;
	char *p;
	int unit;

	p = end - 1;
	*p = '\0';
	unit = base;
	do {
		if (p == begin)
			return -EINVAL;
		*--p = 'a' + (index % unit);
		index = (index / unit) - 1;
	} while (index >= 0);

	memmove(begin, p, end - p);
	memcpy(buf, prefix, strlen(prefix));

	return 0;
}

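/*
 * Decide whether a writeback cache should be assumed: use the wce config
 * field if VIRTIO_BLK_F_CONFIG_WCE is negotiated, otherwise infer it from
 * VIRTIO_BLK_F_FLUSH.
 */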
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
	if (err)
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);

	return writeback;
}

static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk_size(vblk->disk, true);
}

static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

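/*
 * sysfs "cache_type" attribute: select "write through" or "write back" by
 * writing the wce config field, then refresh the queue's write-cache flag.
 */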
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
		 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

static DEVICE_ATTR_RW(cache_type);

static struct attribute *virtblk_attrs[] = {
	&dev_attr_serial.attr,
	&dev_attr_cache_type.attr,
	NULL,
};

static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;

	if (a == &dev_attr_cache_type.attr &&
	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		return S_IRUGO;

	return a->mode;
}

static const struct attribute_group virtblk_attr_group = {
	.attrs = virtblk_attrs,
	.is_visible = virtblk_attrs_are_visible,
};

static const struct attribute_group *virtblk_attr_groups[] = {
	&virtblk_attr_group,
	NULL,
};

static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					vblk->vdev, 0);
}

static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
	.map_queues	= virtblk_map_queues,
};

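/*
 * Optional "queue_depth" module parameter; when zero (the default) the queue
 * depth is sized to fill the virtqueue ring in virtblk_probe().
 */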
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);

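/*
 * Probe: allocate the virtio_blk structure, set up the virtqueues and the
 * blk-mq tag set, read feature-dependent limits from the config space, and
 * register the gendisk.
 */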
static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	struct request_queue *q;
	int err, index;

	u32 v, blk_size, max_size, sg_elems, opt_io_size;
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;

	/* We need to know how many segments before we allocate. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
		sg_elems = 1;

	/* We need extra sg elements at head and tail for the header and status. */
	sg_elems += 2;
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
	if (!vblk) {
		err = -ENOMEM;
		goto out_free_index;
	}

	/* This reference is dropped in virtblk_remove(). */
	refcount_set(&vblk->refs, 1);
	mutex_init(&vblk->vdev_mutex);

	vblk->vdev = vdev;
	vblk->sg_elems = sg_elems;

	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);

	err = init_vq(vblk);
	if (err)
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	/* Default queue sizing is to fill the ring. */
	if (!virtblk_queue_depth) {
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
			virtblk_queue_depth /= 2;
	}

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	vblk->tag_set.cmd_size =
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		err = -ENOMEM;
		goto out_free_tags;
	}
	vblk->disk->queue = q;

	q->queuedata = vblk;

	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

	vblk->disk->major = major;
	vblk->disk->first_minor = index_to_minor(index);
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
	vblk->index = index;

	/* configure queue flush support */
	virtblk_update_cache_mode(vdev);

	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

	/* We can handle whatever the host told us to handle. */
	blk_queue_max_segments(q, vblk->sg_elems-2);

	/* No real sector limit. */
	blk_queue_max_hw_sectors(q, -1U);

	max_size = virtio_max_dma_size(vdev);

	/* Host can optionally specify maximum segment size and number of
	 * segments. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
	if (!err)
		max_size = min(max_size, v);

	blk_queue_max_segment_size(q, max_size);

	/* Host can optionally specify the block size of the device */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
	if (!err) {
		err = blk_validate_block_size(blk_size);
		if (err) {
			dev_err(&vdev->dev,
				"virtio_blk: invalid block size: 0x%x\n",
				blk_size);
			goto out_free_tags;
		}

		blk_queue_logical_block_size(q, blk_size);
	} else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		if (v)
			q->limits.discard_granularity = v << SECTOR_SHIFT;
		else
			q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);

		/*
		 * max_discard_seg == 0 is out of spec but we always
		 * handled it.
		 */
		if (!v)
			v = sg_elems - 2;
		blk_queue_max_discard_segments(q,
					       min(v, MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

	virtblk_update_capacity(vblk, false);
	virtio_device_ready(vdev);

	device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
	return 0;

out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
out_free_vblk:
	kfree(vblk);
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
out:
	return err;
}

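/*
 * Remove: tear down the disk, queue, and virtqueues, then drop the probe
 * reference; the virtio_blk structure is freed once the last opener
 * releases it.
 */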
static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	mutex_lock(&vblk->vdev_mutex);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Virtqueues are stopped, nothing can use vblk->vdev anymore. */
	vblk->vdev = NULL;

	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);

	mutex_unlock(&vblk->vdev_mutex);

	virtblk_put(vblk);
}

#ifdef CONFIG_PM_SLEEP
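/*
 * PM sleep support: freeze resets the device, quiesces I/O and frees the
 * virtqueues; restore re-creates them and resumes the queue.
 */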
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	blk_mq_quiesce_queue(vblk->disk->queue);

	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);

	return 0;
}

static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	ret = init_vq(vdev->priv);
	if (ret)
		return ret;

	virtio_device_ready(vdev);

	blk_mq_unquiesce_queue(vblk->disk->queue);
	return 0;
}
#endif

static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static unsigned int features_legacy[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};
static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");