// SPDX-License-Identifier: GPL-2.0-only
/*
 * VDUSE: vDPA Device in Userspace
 *
 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
 *
 * Author: Xie Yongji <xieyongji@bytedance.com>
 *
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/dma-map-ops.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <uapi/linux/vduse.h>
#include <uapi/linux/vdpa.h>
#include <uapi/linux/virtio_config.h>
#include <uapi/linux/virtio_ids.h>
#include <uapi/linux/virtio_blk.h>
#include <linux/mod_devicetable.h>

#include "iova_domain.h"

#define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
#define DRV_DESC     "vDPA Device in Userspace"
#define DRV_LICENSE  "GPL v2"

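/*
 * A userspace server drives this module roughly as follows (a sketch
 * only; see Documentation/userspace-api/vduse.rst for the protocol.
 * "foo" stands for whatever name the server chooses):
 *
 *	ctrl = open("/dev/vduse/control", O_RDWR);
 *	ioctl(ctrl, VDUSE_CREATE_DEV, &dev_config);
 *	fd = open("/dev/vduse/foo", O_RDWR);
 *	ioctl(fd, VDUSE_VQ_SETUP, &vq_config);      (once per virtqueue)
 *	loop:
 *		read(fd, &req, sizeof(req));
 *		...handle the request...
 *		write(fd, &resp, sizeof(resp));
 */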
#define VDUSE_DEV_MAX (1U << MINORBITS)
#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
/* 128 MB reserved for virtqueue creation */
#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30

#define IRQ_UNBOUND -1

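/*
 * Per-virtqueue state shared between the vdpa bus side and the
 * userspace server.  @kick_lock protects @kickfd and @kicked,
 * @irq_lock protects the interrupt callback in @cb.  The @inject and
 * @kick works defer interrupt injection and kicks to workqueue
 * context when they cannot be done directly.
 */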
struct vduse_virtqueue {
	u16 index;
	u16 num_max;
	u32 num;
	u64 desc_addr;
	u64 driver_addr;
	u64 device_addr;
	struct vdpa_vq_state state;
	bool ready;
	bool kicked;
	spinlock_t kick_lock;
	spinlock_t irq_lock;
	struct eventfd_ctx *kickfd;
	struct vdpa_callback cb;
	struct work_struct inject;
	struct work_struct kick;
	int irq_effective_cpu;
	struct cpumask irq_affinity;
	struct kobject kobj;
};

struct vduse_dev;

struct vduse_vdpa {
	struct vdpa_device vdpa;
	struct vduse_dev *dev;
};

struct vduse_umem {
	unsigned long iova;
	unsigned long npages;
	struct page **pages;
	struct mm_struct *mm;
};

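/*
 * State of a VDUSE device.  Control-plane requests to the userspace
 * server are queued on @send_list, handed out through read() on the
 * device fd and parked on @recv_list until the matching response
 * arrives through write().  @msg_lock protects both lists; @broken is
 * set once the server is deemed unresponsive.
 */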
struct vduse_dev {
	struct vduse_vdpa *vdev;
	struct device *dev;
	struct vduse_virtqueue **vqs;
	struct vduse_iova_domain *domain;
	char *name;
	struct mutex lock;
	spinlock_t msg_lock;
	u64 msg_unique;
	u32 msg_timeout;
	wait_queue_head_t waitq;
	struct list_head send_list;
	struct list_head recv_list;
	struct vdpa_callback config_cb;
	struct work_struct inject;
	spinlock_t irq_lock;
	struct rw_semaphore rwsem;
	int minor;
	bool broken;
	bool connected;
	u64 api_version;
	u64 device_features;
	u64 driver_features;
	u32 device_id;
	u32 vendor_id;
	u32 generation;
	u32 config_size;
	void *config;
	u8 status;
	u32 vq_num;
	u32 vq_align;
	struct vduse_umem *umem;
	struct mutex mem_lock;
	unsigned int bounce_size;
	struct mutex domain_lock;
};

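/*
 * One in-flight control-plane request, together with the response the
 * submitter is sleeping on.
 */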
struct vduse_dev_msg {
	struct vduse_dev_request req;
	struct vduse_dev_response resp;
	struct list_head list;
	wait_queue_head_t waitq;
	bool completed;
};

struct vduse_control {
	u64 api_version;
};

static DEFINE_MUTEX(vduse_lock);
static DEFINE_IDR(vduse_idr);

static dev_t vduse_major;
static struct class *vduse_class;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
static struct workqueue_struct *vduse_irq_wq;
static struct workqueue_struct *vduse_irq_bound_wq;

static u32 allowed_device_id[] = {
	VIRTIO_ID_BLOCK,
};

static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
{
	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);

	return vdev->dev;
}

static inline struct vduse_dev *dev_to_vduse(struct device *dev)
{
	struct vdpa_device *vdpa = dev_to_vdpa(dev);

	return vdpa_to_vduse(vdpa);
}

static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
					    uint32_t request_id)
{
	struct vduse_dev_msg *msg;

	list_for_each_entry(msg, head, list) {
		if (msg->req.request_id == request_id) {
			list_del(&msg->list);
			return msg;
		}
	}

	return NULL;
}

static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
{
	struct vduse_dev_msg *msg = NULL;

	if (!list_empty(head)) {
		msg = list_first_entry(head, struct vduse_dev_msg, list);
		list_del(&msg->list);
	}

	return msg;
}

static void vduse_enqueue_msg(struct list_head *head,
			      struct vduse_dev_msg *msg)
{
	list_add_tail(&msg->list, head);
}

static void vduse_dev_broken(struct vduse_dev *dev)
{
	struct vduse_dev_msg *msg, *tmp;

	if (unlikely(dev->broken))
		return;

	list_splice_init(&dev->recv_list, &dev->send_list);
	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
		list_del(&msg->list);
		msg->completed = 1;
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		wake_up(&msg->waitq);
	}
	dev->broken = true;
	wake_up(&dev->waitq);
}

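/*
 * Send a request to the userspace server and wait (killable) for its
 * response.  If @msg_timeout is non-zero and expires before a reply
 * arrives, the device is marked broken and every other waiter is
 * failed too, since the server can no longer be trusted to answer.
 */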
static int vduse_dev_msg_sync(struct vduse_dev *dev,
			      struct vduse_dev_msg *msg)
{
	int ret;

	if (unlikely(dev->broken))
		return -EIO;

	init_waitqueue_head(&msg->waitq);
	spin_lock(&dev->msg_lock);
	if (unlikely(dev->broken)) {
		spin_unlock(&dev->msg_lock);
		return -EIO;
	}
	msg->req.request_id = dev->msg_unique++;
	vduse_enqueue_msg(&dev->send_list, msg);
	wake_up(&dev->waitq);
	spin_unlock(&dev->msg_lock);
	if (dev->msg_timeout)
		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
						  (long)dev->msg_timeout * HZ);
	else
		ret = wait_event_killable(msg->waitq, msg->completed);

	spin_lock(&dev->msg_lock);
	if (!msg->completed) {
		list_del(&msg->list);
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Mark the device as malfunctioning when there is a timeout */
		if (!ret)
			vduse_dev_broken(dev);
	}
	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
	spin_unlock(&dev->msg_lock);

	return ret;
}

static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
					 struct vduse_virtqueue *vq,
					 struct vdpa_vq_state_packed *packed)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	packed->last_avail_counter =
			msg.resp.vq_state.packed.last_avail_counter & 0x0001;
	packed->last_avail_idx =
			msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
	packed->last_used_counter =
			msg.resp.vq_state.packed.last_used_counter & 0x0001;
	packed->last_used_idx =
			msg.resp.vq_state.packed.last_used_idx & 0x7FFF;

	return 0;
}

static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
					struct vduse_virtqueue *vq,
					struct vdpa_vq_state_split *split)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	split->avail_index = msg.resp.vq_state.split.avail_index;

	return 0;
}

static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
{
	struct vduse_dev_msg msg = { 0 };

	msg.req.type = VDUSE_SET_STATUS;
	msg.req.s.status = status;

	return vduse_dev_msg_sync(dev, &msg);
}

static int vduse_dev_update_iotlb(struct vduse_dev *dev,
				  u64 start, u64 last)
{
	struct vduse_dev_msg msg = { 0 };

	if (last < start)
		return -EINVAL;

	msg.req.type = VDUSE_UPDATE_IOTLB;
	msg.req.iova.start = start;
	msg.req.iova.last = last;

	return vduse_dev_msg_sync(dev, &msg);
}

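/*
 * read() on the device fd hands the next pending request to the
 * server (blocking unless O_NONBLOCK is set); the server replies by
 * write()ing a struct vduse_dev_response carrying the matching
 * request_id back to the same fd.
 */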
static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_msg *msg;
	int size = sizeof(struct vduse_dev_request);
	ssize_t ret;

	if (iov_iter_count(to) < size)
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	while (1) {
		msg = vduse_dequeue_msg(&dev->send_list);
		if (msg)
			break;

		ret = -EAGAIN;
		if (file->f_flags & O_NONBLOCK)
			goto unlock;

		spin_unlock(&dev->msg_lock);
		ret = wait_event_interruptible_exclusive(dev->waitq,
					!list_empty(&dev->send_list));
		if (ret)
			return ret;

		spin_lock(&dev->msg_lock);
	}
	spin_unlock(&dev->msg_lock);
	ret = copy_to_iter(&msg->req, size, to);
	spin_lock(&dev->msg_lock);
	if (ret != size) {
		ret = -EFAULT;
		vduse_enqueue_msg(&dev->send_list, msg);
		goto unlock;
	}
	vduse_enqueue_msg(&dev->recv_list, msg);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}

static bool is_mem_zero(const char *ptr, int size)
{
	int i;

	for (i = 0; i < size; i++) {
		if (ptr[i])
			return false;
	}
	return true;
}

static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_response resp;
	struct vduse_dev_msg *msg;
	size_t ret;

	ret = copy_from_iter(&resp, sizeof(resp), from);
	if (ret != sizeof(resp))
		return -EINVAL;

	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
	if (!msg) {
		ret = -ENOENT;
		goto unlock;
	}

	memcpy(&msg->resp, &resp, sizeof(resp));
	msg->completed = 1;
	wake_up(&msg->waitq);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}

static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
{
	struct vduse_dev *dev = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->msg_lock);

	if (unlikely(dev->broken))
		mask |= EPOLLERR;
	if (!list_empty(&dev->send_list))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (!list_empty(&dev->recv_list))
		mask |= EPOLLOUT | EPOLLWRNORM;

	spin_unlock(&dev->msg_lock);

	return mask;
}

static void vduse_dev_reset(struct vduse_dev *dev)
{
	int i;
	struct vduse_iova_domain *domain = dev->domain;

	/* The coherent mappings are handled in vduse_dev_free_coherent() */
	if (domain && domain->bounce_map)
		vduse_domain_reset_bounce_map(domain);

	down_write(&dev->rwsem);

	dev->status = 0;
	dev->driver_features = 0;
	dev->generation++;
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = NULL;
	dev->config_cb.private = NULL;
	spin_unlock(&dev->irq_lock);
	flush_work(&dev->inject);

	for (i = 0; i < dev->vq_num; i++) {
		struct vduse_virtqueue *vq = dev->vqs[i];

		vq->ready = false;
		vq->desc_addr = 0;
		vq->driver_addr = 0;
		vq->device_addr = 0;
		vq->num = 0;
		memset(&vq->state, 0, sizeof(vq->state));

		spin_lock(&vq->kick_lock);
		vq->kicked = false;
		if (vq->kickfd)
			eventfd_ctx_put(vq->kickfd);
		vq->kickfd = NULL;
		spin_unlock(&vq->kick_lock);

		spin_lock(&vq->irq_lock);
		vq->cb.callback = NULL;
		vq->cb.private = NULL;
		vq->cb.trigger = NULL;
		spin_unlock(&vq->irq_lock);
		flush_work(&vq->inject);
		flush_work(&vq->kick);
	}

	up_write(&dev->rwsem);
}

static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
				u64 desc_area, u64 driver_area,
				u64 device_area)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	vq->desc_addr = desc_area;
	vq->driver_addr = driver_area;
	vq->device_addr = device_area;

	return 0;
}

static void vduse_vq_kick(struct vduse_virtqueue *vq)
{
	spin_lock(&vq->kick_lock);
	if (!vq->ready)
		goto unlock;

	if (vq->kickfd)
		eventfd_signal(vq->kickfd, 1);
	else
		vq->kicked = true;
unlock:
	spin_unlock(&vq->kick_lock);
}

static void vduse_vq_kick_work(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, kick);

	vduse_vq_kick(vq);
}

static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	if (!eventfd_signal_allowed()) {
		schedule_work(&vq->kick);
		return;
	}
	vduse_vq_kick(vq);
}

static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
			      struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	spin_lock(&vq->irq_lock);
	vq->cb.callback = cb->callback;
	vq->cb.private = cb->private;
	vq->cb.trigger = cb->trigger;
	spin_unlock(&vq->irq_lock);
}

static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	vq->num = num;
}

static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
					u16 idx, bool ready)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	vq->ready = ready;
}

static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	return vq->ready;
}

static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
				const struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
		vq->state.packed.last_avail_counter =
				state->packed.last_avail_counter;
		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
		vq->state.packed.last_used_counter =
				state->packed.last_used_counter;
		vq->state.packed.last_used_idx = state->packed.last_used_idx;
	} else
		vq->state.split.avail_index = state->split.avail_index;

	return 0;
}

static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
				struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);

	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
}

static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vq_align;
}

static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_features;
}

static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->driver_features = features;
	return 0;
}

static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->driver_features;
}

static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
				  struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = cb->callback;
	dev->config_cb.private = cb->private;
	spin_unlock(&dev->irq_lock);
}

static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	u16 num_max = 0;
	int i;

	for (i = 0; i < dev->vq_num; i++)
		if (num_max < dev->vqs[i]->num_max)
			num_max = dev->vqs[i]->num_max;

	return num_max;
}

static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_id;
}

static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vendor_id;
}

static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->status;
}

static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (vduse_dev_set_status(dev, status))
		return;

	dev->status = status;
}

static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->config_size;
}

static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
				  void *buf, unsigned int len)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	/* Initialize the buffer in case of partial copy. */
	memset(buf, 0, len);

	if (offset > dev->config_size)
		return;

	if (len > dev->config_size - offset)
		len = dev->config_size - offset;

	memcpy(buf, dev->config + offset, len);
}

static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
			const void *buf, unsigned int len)
{
	/* Now we only support read-only configuration space */
}

static int vduse_vdpa_reset(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret = vduse_dev_set_status(dev, 0);

	vduse_dev_reset(dev);

	return ret;
}

static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->generation;
}

static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
				      const struct cpumask *cpu_mask)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (cpu_mask)
		cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
	else
		cpumask_setall(&dev->vqs[idx]->irq_affinity);

	return 0;
}

static const struct cpumask *
vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return &dev->vqs[idx]->irq_affinity;
}

static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
				unsigned int asid,
				struct vhost_iotlb *iotlb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret;

	ret = vduse_domain_set_map(dev->domain, iotlb);
	if (ret)
		return ret;

	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
	if (ret) {
		vduse_domain_clear_map(dev->domain, iotlb);
		return ret;
	}

	return 0;
}

static void vduse_vdpa_free(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->vdev = NULL;
}

static const struct vdpa_config_ops vduse_vdpa_config_ops = {
	.set_vq_address		= vduse_vdpa_set_vq_address,
	.kick_vq		= vduse_vdpa_kick_vq,
	.set_vq_cb		= vduse_vdpa_set_vq_cb,
	.set_vq_num		= vduse_vdpa_set_vq_num,
	.set_vq_ready		= vduse_vdpa_set_vq_ready,
	.get_vq_ready		= vduse_vdpa_get_vq_ready,
	.set_vq_state		= vduse_vdpa_set_vq_state,
	.get_vq_state		= vduse_vdpa_get_vq_state,
	.get_vq_align		= vduse_vdpa_get_vq_align,
	.get_device_features	= vduse_vdpa_get_device_features,
	.set_driver_features	= vduse_vdpa_set_driver_features,
	.get_driver_features	= vduse_vdpa_get_driver_features,
	.set_config_cb		= vduse_vdpa_set_config_cb,
	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
	.get_device_id		= vduse_vdpa_get_device_id,
	.get_vendor_id		= vduse_vdpa_get_vendor_id,
	.get_status		= vduse_vdpa_get_status,
	.set_status		= vduse_vdpa_set_status,
	.get_config_size	= vduse_vdpa_get_config_size,
	.get_config		= vduse_vdpa_get_config,
	.set_config		= vduse_vdpa_set_config,
	.get_generation		= vduse_vdpa_get_generation,
	.set_vq_affinity	= vduse_vdpa_set_vq_affinity,
	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
	.reset			= vduse_vdpa_reset,
	.set_map		= vduse_vdpa_set_map,
	.free			= vduse_vdpa_free,
};

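/*
 * DMA for the vdpa device is routed through the VDUSE IOVA domain, so
 * that virtio buffers land in memory the userspace server can reach:
 * either the kernel bounce buffer or user pages registered via
 * VDUSE_IOTLB_REG_UMEM.
 */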
static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
				     unsigned long offset, size_t size,
				     enum dma_data_direction dir,
				     unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
}

static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
				size_t size, enum dma_data_direction dir,
				unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
}

static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
					dma_addr_t *dma_addr, gfp_t flag,
					unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;
	unsigned long iova;
	void *addr;

	*dma_addr = DMA_MAPPING_ERROR;
	addr = vduse_domain_alloc_coherent(domain, size,
				(dma_addr_t *)&iova, flag, attrs);
	if (!addr)
		return NULL;

	*dma_addr = (dma_addr_t)iova;

	return addr;
}

static void vduse_dev_free_coherent(struct device *dev, size_t size,
					void *vaddr, dma_addr_t dma_addr,
					unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
}

static size_t vduse_dev_max_mapping_size(struct device *dev)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return domain->bounce_size;
}

static const struct dma_map_ops vduse_dev_dma_ops = {
	.map_page = vduse_dev_map_page,
	.unmap_page = vduse_dev_unmap_page,
	.alloc = vduse_dev_alloc_coherent,
	.free = vduse_dev_free_coherent,
	.max_mapping_size = vduse_dev_max_mapping_size,
};

static unsigned int perm_to_file_flags(u8 perm)
{
	unsigned int flags = 0;

	switch (perm) {
	case VDUSE_ACCESS_WO:
		flags |= O_WRONLY;
		break;
	case VDUSE_ACCESS_RO:
		flags |= O_RDONLY;
		break;
	case VDUSE_ACCESS_RW:
		flags |= O_RDWR;
		break;
	default:
		WARN(1, "invalid vhost IOTLB permission\n");
		break;
	}

	return flags;
}

static int vduse_kickfd_setup(struct vduse_dev *dev,
			struct vduse_vq_eventfd *eventfd)
{
	struct eventfd_ctx *ctx = NULL;
	struct vduse_virtqueue *vq;
	u32 index;

	if (eventfd->index >= dev->vq_num)
		return -EINVAL;

	index = array_index_nospec(eventfd->index, dev->vq_num);
	vq = dev->vqs[index];
	if (eventfd->fd >= 0) {
		ctx = eventfd_ctx_fdget(eventfd->fd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
		return 0;

	spin_lock(&vq->kick_lock);
	if (vq->kickfd)
		eventfd_ctx_put(vq->kickfd);
	vq->kickfd = ctx;
	if (vq->ready && vq->kicked && vq->kickfd) {
		eventfd_signal(vq->kickfd, 1);
		vq->kicked = false;
	}
	spin_unlock(&vq->kick_lock);

	return 0;
}

static bool vduse_dev_is_ready(struct vduse_dev *dev)
{
	int i;

	for (i = 0; i < dev->vq_num; i++)
		if (!dev->vqs[i]->num_max)
			return false;

	return true;
}

static void vduse_dev_irq_inject(struct work_struct *work)
{
	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);

	spin_lock_bh(&dev->irq_lock);
	if (dev->config_cb.callback)
		dev->config_cb.callback(dev->config_cb.private);
	spin_unlock_bh(&dev->irq_lock);
}

static void vduse_vq_irq_inject(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, inject);

	spin_lock_bh(&vq->irq_lock);
	if (vq->ready && vq->cb.callback)
		vq->cb.callback(vq->cb.private);
	spin_unlock_bh(&vq->irq_lock);
}

static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
{
	bool signal = false;

	if (!vq->cb.trigger)
		return false;

	spin_lock_irq(&vq->irq_lock);
	if (vq->ready && vq->cb.trigger) {
		eventfd_signal(vq->cb.trigger, 1);
		signal = true;
	}
	spin_unlock_irq(&vq->irq_lock);

	return signal;
}

static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
				    struct work_struct *irq_work,
				    int irq_effective_cpu)
{
	int ret = -EINVAL;

	down_read(&dev->rwsem);
	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
		goto unlock;

	ret = 0;
	if (irq_effective_cpu == IRQ_UNBOUND)
		queue_work(vduse_irq_wq, irq_work);
	else
		queue_work_on(irq_effective_cpu,
			      vduse_irq_bound_wq, irq_work);
unlock:
	up_read(&dev->rwsem);

	return ret;
}

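/*
 * Userspace may donate its own memory to back the bounce buffer via
 * VDUSE_IOTLB_REG_UMEM.  The pages are pinned with FOLL_LONGTERM and
 * accounted against RLIMIT_MEMLOCK, and are released again on
 * VDUSE_IOTLB_DEREG_UMEM or when the device fd is closed.
 */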
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
				u64 iova, u64 size)
{
	int ret;

	mutex_lock(&dev->mem_lock);
	ret = -ENOENT;
	if (!dev->umem)
		goto unlock;

	ret = -EINVAL;
	if (!dev->domain)
		goto unlock;

	if (dev->umem->iova != iova || size != dev->domain->bounce_size)
		goto unlock;

	vduse_domain_remove_user_bounce_pages(dev->domain);
	unpin_user_pages_dirty_lock(dev->umem->pages,
				    dev->umem->npages, true);
	atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
	mmdrop(dev->umem->mm);
	vfree(dev->umem->pages);
	kfree(dev->umem);
	dev->umem = NULL;
	ret = 0;
unlock:
	mutex_unlock(&dev->mem_lock);
	return ret;
}

static int vduse_dev_reg_umem(struct vduse_dev *dev,
			      u64 iova, u64 uaddr, u64 size)
{
	struct page **page_list = NULL;
	struct vduse_umem *umem = NULL;
	long pinned = 0;
	unsigned long npages, lock_limit;
	int ret;

	if (!dev->domain || !dev->domain->bounce_map ||
	    size != dev->domain->bounce_size ||
	    iova != 0 || uaddr & ~PAGE_MASK)
		return -EINVAL;

	mutex_lock(&dev->mem_lock);
	ret = -EEXIST;
	if (dev->umem)
		goto unlock;

	ret = -ENOMEM;
	npages = size >> PAGE_SHIFT;
	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
			      GFP_KERNEL_ACCOUNT);
	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!page_list || !umem)
		goto unlock;

	mmap_read_lock(current->mm);

	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
		goto out;

	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
				page_list);
	if (pinned != npages) {
		ret = pinned < 0 ? pinned : -ENOMEM;
		goto out;
	}

	ret = vduse_domain_add_user_bounce_pages(dev->domain,
						 page_list, pinned);
	if (ret)
		goto out;

	atomic64_add(npages, &current->mm->pinned_vm);

	umem->pages = page_list;
	umem->npages = pinned;
	umem->iova = iova;
	umem->mm = current->mm;
	mmgrab(current->mm);

	dev->umem = umem;
out:
	if (ret && pinned > 0)
		unpin_user_pages(page_list, pinned);

	mmap_read_unlock(current->mm);
unlock:
	if (ret) {
		vfree(page_list);
		kfree(umem);
	}
	mutex_unlock(&dev->mem_lock);
	return ret;
}

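/*
 * Rotate to the next online CPU in the virtqueue's affinity mask so
 * that bound interrupt injection work is spread across the allowed
 * CPUs.  IRQ_UNBOUND (-1) restarts the scan from CPU 0 and also
 * denotes "no bound CPU", i.e. the unbound workqueue path.
 */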
static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
{
	int curr_cpu = vq->irq_effective_cpu;

	while (true) {
		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
		if (cpu_online(curr_cpu))
			break;

		if (curr_cpu >= nr_cpu_ids)
			curr_cpu = IRQ_UNBOUND;
	}

	vq->irq_effective_cpu = curr_cpu;
}

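/* ioctls on the /dev/vduse/$DEVICE fd, issued by the userspace server. */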
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	struct vduse_dev *dev = file->private_data;
	void __user *argp = (void __user *)arg;
	int ret;

	if (unlikely(dev->broken))
		return -EPERM;

	switch (cmd) {
	case VDUSE_IOTLB_GET_FD: {
		struct vduse_iotlb_entry entry;
		struct vhost_iotlb_map *map;
		struct vdpa_map_file *map_file;
		struct file *f = NULL;

		ret = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof(entry)))
			break;

		ret = -EINVAL;
		if (entry.start > entry.last)
			break;

		mutex_lock(&dev->domain_lock);
		if (!dev->domain) {
			mutex_unlock(&dev->domain_lock);
			break;
		}
		spin_lock(&dev->domain->iotlb_lock);
		map = vhost_iotlb_itree_first(dev->domain->iotlb,
					      entry.start, entry.last);
		if (map) {
			map_file = (struct vdpa_map_file *)map->opaque;
			f = get_file(map_file->file);
			entry.offset = map_file->offset;
			entry.start = map->start;
			entry.last = map->last;
			entry.perm = map->perm;
		}
		spin_unlock(&dev->domain->iotlb_lock);
		mutex_unlock(&dev->domain_lock);
		ret = -EINVAL;
		if (!f)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &entry, sizeof(entry))) {
			fput(f);
			break;
		}
		ret = receive_fd(f, perm_to_file_flags(entry.perm));
		fput(f);
		break;
	}
	case VDUSE_DEV_GET_FEATURES:
		/*
		 * Just mirror what the driver wrote here.
		 * The driver is expected to check FEATURES_OK later.
		 */
		ret = put_user(dev->driver_features, (u64 __user *)argp);
		break;
	case VDUSE_DEV_SET_CONFIG: {
		struct vduse_config_data config;
		unsigned long size = offsetof(struct vduse_config_data,
					      buffer);

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		if (config.offset > dev->config_size ||
		    config.length == 0 ||
		    config.length > dev->config_size - config.offset)
			break;

		ret = -EFAULT;
		if (copy_from_user(dev->config + config.offset, argp + size,
				   config.length))
			break;

		ret = 0;
		break;
	}
	case VDUSE_DEV_INJECT_CONFIG_IRQ:
		ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
		break;
	case VDUSE_VQ_SETUP: {
		struct vduse_vq_config config;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, sizeof(config)))
			break;

		ret = -EINVAL;
		if (config.index >= dev->vq_num)
			break;

		if (!is_mem_zero((const char *)config.reserved,
				 sizeof(config.reserved)))
			break;

		index = array_index_nospec(config.index, dev->vq_num);
		dev->vqs[index]->num_max = config.max_size;
		ret = 0;
		break;
	}
	case VDUSE_VQ_GET_INFO: {
		struct vduse_vq_info vq_info;
		struct vduse_virtqueue *vq;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
			break;

		ret = -EINVAL;
		if (vq_info.index >= dev->vq_num)
			break;

		index = array_index_nospec(vq_info.index, dev->vq_num);
		vq = dev->vqs[index];
		vq_info.desc_addr = vq->desc_addr;
		vq_info.driver_addr = vq->driver_addr;
		vq_info.device_addr = vq->device_addr;
		vq_info.num = vq->num;

		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
			vq_info.packed.last_avail_counter =
				vq->state.packed.last_avail_counter;
			vq_info.packed.last_avail_idx =
				vq->state.packed.last_avail_idx;
			vq_info.packed.last_used_counter =
				vq->state.packed.last_used_counter;
			vq_info.packed.last_used_idx =
				vq->state.packed.last_used_idx;
		} else
			vq_info.split.avail_index =
				vq->state.split.avail_index;

		vq_info.ready = vq->ready;

		ret = -EFAULT;
		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
			break;

		ret = 0;
		break;
	}
	case VDUSE_VQ_SETUP_KICKFD: {
		struct vduse_vq_eventfd eventfd;

		ret = -EFAULT;
		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
			break;

		ret = vduse_kickfd_setup(dev, &eventfd);
		break;
	}
	case VDUSE_VQ_INJECT_IRQ: {
		u32 index;

		ret = -EFAULT;
		if (get_user(index, (u32 __user *)argp))
			break;

		ret = -EINVAL;
		if (index >= dev->vq_num)
			break;

		ret = 0;
		index = array_index_nospec(index, dev->vq_num);
		if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
			vduse_vq_update_effective_cpu(dev->vqs[index]);
			ret = vduse_dev_queue_irq_work(dev,
						&dev->vqs[index]->inject,
						dev->vqs[index]->irq_effective_cpu);
		}
		break;
	}
	case VDUSE_IOTLB_REG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;

		mutex_lock(&dev->domain_lock);
		ret = vduse_dev_reg_umem(dev, umem.iova,
					 umem.uaddr, umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	case VDUSE_IOTLB_DEREG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;
		mutex_lock(&dev->domain_lock);
		ret = vduse_dev_dereg_umem(dev, umem.iova,
					   umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	case VDUSE_IOTLB_GET_INFO: {
		struct vduse_iova_info info;
		struct vhost_iotlb_map *map;

		ret = -EFAULT;
		if (copy_from_user(&info, argp, sizeof(info)))
			break;

		ret = -EINVAL;
		if (info.start > info.last)
			break;

		if (!is_mem_zero((const char *)info.reserved,
				 sizeof(info.reserved)))
			break;

		mutex_lock(&dev->domain_lock);
		if (!dev->domain) {
			mutex_unlock(&dev->domain_lock);
			break;
		}
		spin_lock(&dev->domain->iotlb_lock);
		map = vhost_iotlb_itree_first(dev->domain->iotlb,
					      info.start, info.last);
		if (map) {
			info.start = map->start;
			info.last = map->last;
			info.capability = 0;
			if (dev->domain->bounce_map && map->start == 0 &&
			    map->last == dev->domain->bounce_size - 1)
				info.capability |= VDUSE_IOVA_CAP_UMEM;
		}
		spin_unlock(&dev->domain->iotlb_lock);
		mutex_unlock(&dev->domain_lock);
		if (!map)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &info, sizeof(info)))
			break;

		ret = 0;
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
	}

	return ret;
}

static int vduse_dev_release(struct inode *inode, struct file *file)
{
	struct vduse_dev *dev = file->private_data;

	mutex_lock(&dev->domain_lock);
	if (dev->domain)
		vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
	mutex_unlock(&dev->domain_lock);
	spin_lock(&dev->msg_lock);
	/* Make sure the inflight messages can be processed after reconnection */
	list_splice_init(&dev->recv_list, &dev->send_list);
	spin_unlock(&dev->msg_lock);
	dev->connected = false;

	return 0;
}

static struct vduse_dev *vduse_dev_get_from_minor(int minor)
{
	struct vduse_dev *dev;

	mutex_lock(&vduse_lock);
	dev = idr_find(&vduse_idr, minor);
	mutex_unlock(&vduse_lock);

	return dev;
}

static int vduse_dev_open(struct inode *inode, struct file *file)
{
	int ret;
	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));

	if (!dev)
		return -ENODEV;

	ret = -EBUSY;
	mutex_lock(&dev->lock);
	if (dev->connected)
		goto unlock;

	ret = 0;
	dev->connected = true;
	file->private_data = dev;
unlock:
	mutex_unlock(&dev->lock);

	return ret;
}

static const struct file_operations vduse_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_dev_open,
	.release	= vduse_dev_release,
	.read_iter	= vduse_dev_read_iter,
	.write_iter	= vduse_dev_write_iter,
	.poll		= vduse_dev_poll,
	.unlocked_ioctl	= vduse_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

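/*
 * Per-virtqueue sysfs attribute: reading irq_cb_affinity shows the
 * current interrupt-callback affinity mask; writing a hex CPU mask
 * (e.g. "echo f" for CPUs 0-3, assuming they exist and are online)
 * restricts interrupt injection to those CPUs.
 */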
static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
{
	return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
}

static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
				     const char *buf, size_t count)
{
	cpumask_var_t new_value;
	int ret;

	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	ret = cpumask_parse(buf, new_value);
	if (ret)
		goto free_mask;

	ret = -EINVAL;
	if (!cpumask_intersects(new_value, cpu_online_mask))
		goto free_mask;

	cpumask_copy(&vq->irq_affinity, new_value);
	ret = count;
free_mask:
	free_cpumask_var(new_value);
	return ret;
}

struct vq_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
	ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
			 size_t count);
};

static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);

static struct attribute *vq_attrs[] = {
	&irq_cb_affinity_attr.attr,
	NULL,
};
ATTRIBUTE_GROUPS(vq);

static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
			    char *buf)
{
	struct vduse_virtqueue *vq = container_of(kobj,
					struct vduse_virtqueue, kobj);
	struct vq_sysfs_entry *entry = container_of(attr,
					struct vq_sysfs_entry, attr);

	if (!entry->show)
		return -EIO;

	return entry->show(vq, buf);
}

static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
			     const char *buf, size_t count)
{
	struct vduse_virtqueue *vq = container_of(kobj,
					struct vduse_virtqueue, kobj);
	struct vq_sysfs_entry *entry = container_of(attr,
					struct vq_sysfs_entry, attr);

	if (!entry->store)
		return -EIO;

	return entry->store(vq, buf, count);
}

static const struct sysfs_ops vq_sysfs_ops = {
	.show = vq_attr_show,
	.store = vq_attr_store,
};

static void vq_release(struct kobject *kobj)
{
	struct vduse_virtqueue *vq = container_of(kobj,
					struct vduse_virtqueue, kobj);
	kfree(vq);
}

static const struct kobj_type vq_type = {
	.release	= vq_release,
	.sysfs_ops	= &vq_sysfs_ops,
	.default_groups	= vq_groups,
};

static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
{
	int i;

	if (!dev->vqs)
		return;

	for (i = 0; i < dev->vq_num; i++)
		kobject_put(&dev->vqs[i]->kobj);
	kfree(dev->vqs);
}

static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
{
	int ret, i;

	dev->vq_align = vq_align;
	dev->vq_num = vq_num;
	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
	if (!dev->vqs)
		return -ENOMEM;

	for (i = 0; i < vq_num; i++) {
		dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
		if (!dev->vqs[i]) {
			ret = -ENOMEM;
			goto err;
		}

		dev->vqs[i]->index = i;
		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
		spin_lock_init(&dev->vqs[i]->kick_lock);
		spin_lock_init(&dev->vqs[i]->irq_lock);
		cpumask_setall(&dev->vqs[i]->irq_affinity);

		kobject_init(&dev->vqs[i]->kobj, &vq_type);
		ret = kobject_add(&dev->vqs[i]->kobj,
				  &dev->dev->kobj, "vq%d", i);
		if (ret) {
			kfree(dev->vqs[i]);
			goto err;
		}
	}

	return 0;
err:
	while (i--)
		kobject_put(&dev->vqs[i]->kobj);
	kfree(dev->vqs);
	dev->vqs = NULL;
	return ret;
}

static struct vduse_dev *vduse_dev_create(void)
{
	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;

	mutex_init(&dev->lock);
	mutex_init(&dev->mem_lock);
	mutex_init(&dev->domain_lock);
	spin_lock_init(&dev->msg_lock);
	INIT_LIST_HEAD(&dev->send_list);
	INIT_LIST_HEAD(&dev->recv_list);
	spin_lock_init(&dev->irq_lock);
	init_rwsem(&dev->rwsem);

	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
	init_waitqueue_head(&dev->waitq);

	return dev;
}

static void vduse_dev_destroy(struct vduse_dev *dev)
{
	kfree(dev);
}

static struct vduse_dev *vduse_find_dev(const char *name)
{
	struct vduse_dev *dev;
	int id;

	idr_for_each_entry(&vduse_idr, dev, id)
		if (!strcmp(dev->name, name))
			return dev;

	return NULL;
}

static int vduse_destroy_dev(char *name)
{
	struct vduse_dev *dev = vduse_find_dev(name);

	if (!dev)
		return -EINVAL;

	mutex_lock(&dev->lock);
	if (dev->vdev || dev->connected) {
		mutex_unlock(&dev->lock);
		return -EBUSY;
	}
	dev->connected = true;
	mutex_unlock(&dev->lock);

	vduse_dev_reset(dev);
	device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
	idr_remove(&vduse_idr, dev->minor);
	kvfree(dev->config);
	vduse_dev_deinit_vqs(dev);
	if (dev->domain)
		vduse_domain_destroy(dev->domain);
	kfree(dev->name);
	vduse_dev_destroy(dev);
	module_put(THIS_MODULE);

	return 0;
}

static bool device_is_allowed(u32 device_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
		if (allowed_device_id[i] == device_id)
			return true;

	return false;
}

static bool features_is_valid(u64 features)
{
	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
		return false;

	/*
	 * Now we only support a read-only configuration space, so reject
	 * features that require writing it (e.g. VIRTIO_BLK_F_CONFIG_WCE).
	 */
	if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
		return false;

	return true;
}

static bool vduse_validate_config(struct vduse_dev_config *config)
{
	if (!is_mem_zero((const char *)config->reserved,
			 sizeof(config->reserved)))
		return false;

	if (config->vq_align > PAGE_SIZE)
		return false;

	if (config->config_size > PAGE_SIZE)
		return false;

	if (config->vq_num > 0xffff)
		return false;

	if (!config->name[0])
		return false;

	if (!device_is_allowed(config->device_id))
		return false;

	if (!features_is_valid(config->features))
		return false;

	return true;
}

static ssize_t msg_timeout_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
}

static ssize_t msg_timeout_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	int ret;

	ret = kstrtouint(buf, 10, &dev->msg_timeout);
	if (ret < 0)
		return ret;

	return count;
}

static DEVICE_ATTR_RW(msg_timeout);

static ssize_t bounce_size_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->bounce_size);
}

static ssize_t bounce_size_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	unsigned int bounce_size;
	int ret;

	ret = -EPERM;
	mutex_lock(&dev->domain_lock);
	if (dev->domain)
		goto unlock;

	ret = kstrtouint(buf, 10, &bounce_size);
	if (ret < 0)
		goto unlock;

	ret = -EINVAL;
	if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
	    bounce_size < VDUSE_MIN_BOUNCE_SIZE)
		goto unlock;

	dev->bounce_size = bounce_size & PAGE_MASK;
	ret = count;
unlock:
	mutex_unlock(&dev->domain_lock);
	return ret;
}

static DEVICE_ATTR_RW(bounce_size);

static struct attribute *vduse_dev_attrs[] = {
	&dev_attr_msg_timeout.attr,
	&dev_attr_bounce_size.attr,
	NULL
};

ATTRIBUTE_GROUPS(vduse_dev);

static int vduse_create_dev(struct vduse_dev_config *config,
			    void *config_buf, u64 api_version)
{
	int ret;
	struct vduse_dev *dev;

	ret = -EEXIST;
	if (vduse_find_dev(config->name))
		goto err;

	ret = -ENOMEM;
	dev = vduse_dev_create();
	if (!dev)
		goto err;

	dev->api_version = api_version;
	dev->device_features = config->features;
	dev->device_id = config->device_id;
	dev->vendor_id = config->vendor_id;
	dev->name = kstrdup(config->name, GFP_KERNEL);
	if (!dev->name)
		goto err_str;

	dev->bounce_size = VDUSE_BOUNCE_SIZE;
	dev->config = config_buf;
	dev->config_size = config->config_size;

	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
	if (ret < 0)
		goto err_idr;

	dev->minor = ret;
	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
	dev->dev = device_create_with_groups(vduse_class, NULL,
				MKDEV(MAJOR(vduse_major), dev->minor),
				dev, vduse_dev_groups, "%s", config->name);
	if (IS_ERR(dev->dev)) {
		ret = PTR_ERR(dev->dev);
		goto err_dev;
	}

	ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
	if (ret)
		goto err_vqs;

	__module_get(THIS_MODULE);

	return 0;
err_vqs:
	device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
err_dev:
	idr_remove(&vduse_idr, dev->minor);
err_idr:
	kfree(dev->name);
err_str:
	vduse_dev_destroy(dev);
err:
	return ret;
}

static long vduse_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	int ret;
	void __user *argp = (void __user *)arg;
	struct vduse_control *control = file->private_data;

	mutex_lock(&vduse_lock);
	switch (cmd) {
	case VDUSE_GET_API_VERSION:
		ret = put_user(control->api_version, (u64 __user *)argp);
		break;
	case VDUSE_SET_API_VERSION: {
		u64 api_version;

		ret = -EFAULT;
		if (get_user(api_version, (u64 __user *)argp))
			break;

		ret = -EINVAL;
		if (api_version > VDUSE_API_VERSION)
			break;

		ret = 0;
		control->api_version = api_version;
		break;
	}
	case VDUSE_CREATE_DEV: {
		struct vduse_dev_config config;
		unsigned long size = offsetof(struct vduse_dev_config, config);
		void *buf;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		if (!vduse_validate_config(&config))
			break;

		buf = vmemdup_user(argp + size, config.config_size);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			break;
		}
		config.name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_create_dev(&config, buf, control->api_version);
		if (ret)
			kvfree(buf);
		break;
	}
	case VDUSE_DESTROY_DEV: {
		char name[VDUSE_NAME_MAX];

		ret = -EFAULT;
		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
			break;

		name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_destroy_dev(name);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}
	mutex_unlock(&vduse_lock);

	return ret;
}

static int vduse_release(struct inode *inode, struct file *file)
{
	struct vduse_control *control = file->private_data;

	kfree(control);
	return 0;
}

static int vduse_open(struct inode *inode, struct file *file)
{
	struct vduse_control *control;

	control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;

	control->api_version = VDUSE_API_VERSION;
	file->private_data = control;

	return 0;
}

static const struct file_operations vduse_ctrl_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_open,
	.release	= vduse_release,
	.unlocked_ioctl	= vduse_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static char *vduse_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
}

struct vduse_mgmt_dev {
	struct vdpa_mgmt_dev mgmt_dev;
	struct device dev;
};

static struct vduse_mgmt_dev *vduse_mgmt;

static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
{
	struct vduse_vdpa *vdev;
	int ret;

	if (dev->vdev)
		return -EEXIST;

	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
				 &vduse_vdpa_config_ops, 1, 1, name, true);
	if (IS_ERR(vdev))
		return PTR_ERR(vdev);

	dev->vdev = vdev;
	vdev->dev = dev;
	vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
	ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
	if (ret) {
		put_device(&vdev->vdpa.dev);
		return ret;
	}
	set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
	vdev->vdpa.dma_dev = &vdev->vdpa.dev;
	vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;

	return 0;
}

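/*
 * Called through the vdpa management interface (e.g. via the iproute2
 * "vdpa" tool).  The VDUSE device must already exist and have all of
 * its virtqueues configured by the userspace server before it can be
 * attached to the vdpa bus.
 */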
static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
			const struct vdpa_dev_set_config *config)
{
	struct vduse_dev *dev;
	int ret;

	mutex_lock(&vduse_lock);
	dev = vduse_find_dev(name);
	if (!dev || !vduse_dev_is_ready(dev)) {
		mutex_unlock(&vduse_lock);
		return -EINVAL;
	}
	ret = vduse_dev_init_vdpa(dev, name);
	mutex_unlock(&vduse_lock);
	if (ret)
		return ret;

	mutex_lock(&dev->domain_lock);
	if (!dev->domain)
		dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
						  dev->bounce_size);
	mutex_unlock(&dev->domain_lock);
	if (!dev->domain) {
		put_device(&dev->vdev->vdpa.dev);
		return -ENOMEM;
	}

	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
	if (ret) {
		put_device(&dev->vdev->vdpa.dev);
		mutex_lock(&dev->domain_lock);
		vduse_domain_destroy(dev->domain);
		dev->domain = NULL;
		mutex_unlock(&dev->domain_lock);
		return ret;
	}

	return 0;
}

static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}

static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
	.dev_add = vdpa_dev_add,
	.dev_del = vdpa_dev_del,
};

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static void vduse_mgmtdev_release(struct device *dev)
{
	struct vduse_mgmt_dev *mgmt_dev;

	mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev);
	kfree(mgmt_dev);
}

static int vduse_mgmtdev_init(void)
{
	int ret;

	vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL);
	if (!vduse_mgmt)
		return -ENOMEM;

	ret = dev_set_name(&vduse_mgmt->dev, "vduse");
	if (ret) {
		kfree(vduse_mgmt);
		return ret;
	}

	vduse_mgmt->dev.release = vduse_mgmtdev_release;

	ret = device_register(&vduse_mgmt->dev);
	if (ret)
		goto dev_reg_err;

	vduse_mgmt->mgmt_dev.id_table = id_table;
	vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
	vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
	ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
	if (ret)
		device_unregister(&vduse_mgmt->dev);

	return ret;

dev_reg_err:
	put_device(&vduse_mgmt->dev);
	return ret;
}

static void vduse_mgmtdev_exit(void)
{
	vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
	device_unregister(&vduse_mgmt->dev);
}

static int vduse_init(void)
{
	int ret;
	struct device *dev;

	vduse_class = class_create("vduse");
	if (IS_ERR(vduse_class))
		return PTR_ERR(vduse_class);

	vduse_class->devnode = vduse_devnode;

	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
	if (ret)
		goto err_chardev_region;

	/* /dev/vduse/control */
	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
	vduse_ctrl_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
	if (ret)
		goto err_ctrl_cdev;

	dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		goto err_device;
	}

	/* /dev/vduse/$DEVICE */
	cdev_init(&vduse_cdev, &vduse_dev_fops);
	vduse_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
		       VDUSE_DEV_MAX - 1);
	if (ret)
		goto err_cdev;

	ret = -ENOMEM;
	vduse_irq_wq = alloc_workqueue("vduse-irq",
				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
	if (!vduse_irq_wq)
		goto err_wq;

	vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
	if (!vduse_irq_bound_wq)
		goto err_bound_wq;

	ret = vduse_domain_init();
	if (ret)
		goto err_domain;

	ret = vduse_mgmtdev_init();
	if (ret)
		goto err_mgmtdev;

	return 0;
err_mgmtdev:
	vduse_domain_exit();
err_domain:
	destroy_workqueue(vduse_irq_bound_wq);
err_bound_wq:
	destroy_workqueue(vduse_irq_wq);
err_wq:
	cdev_del(&vduse_cdev);
err_cdev:
	device_destroy(vduse_class, vduse_major);
err_device:
	cdev_del(&vduse_ctrl_cdev);
err_ctrl_cdev:
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
err_chardev_region:
	class_destroy(vduse_class);
	return ret;
}
module_init(vduse_init);

static void vduse_exit(void)
{
	vduse_mgmtdev_exit();
	vduse_domain_exit();
	destroy_workqueue(vduse_irq_bound_wq);
	destroy_workqueue(vduse_irq_wq);
	cdev_del(&vduse_cdev);
	device_destroy(vduse_class, vduse_major);
	cdev_del(&vduse_ctrl_cdev);
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
	class_destroy(vduse_class);
}
module_exit(vduse_exit);

MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);