xref: /kernel/linux/linux-5.10/drivers/vfio/vfio.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6 *     Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13#include <linux/cdev.h>
14#include <linux/compat.h>
15#include <linux/device.h>
16#include <linux/file.h>
17#include <linux/anon_inodes.h>
18#include <linux/fs.h>
19#include <linux/idr.h>
20#include <linux/iommu.h>
21#include <linux/list.h>
22#include <linux/miscdevice.h>
23#include <linux/module.h>
24#include <linux/mutex.h>
25#include <linux/pci.h>
26#include <linux/rwsem.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/stat.h>
30#include <linux/string.h>
31#include <linux/uaccess.h>
32#include <linux/vfio.h>
33#include <linux/wait.h>
34#include <linux/sched/signal.h>
35
36#define DRIVER_VERSION	"0.3"
37#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
38#define DRIVER_DESC	"VFIO - User Level meta-driver"
39
/*
 * Global VFIO core state.  The type is defined together with its single
 * file-scope instance, also named "vfio".
 */
static struct vfio {
	/* Device class passed to device_create()/device_destroy() for group nodes */
	struct class			*class;
	/* Registered IOMMU backends (struct vfio_iommu_driver, linked via vfio_next) */
	struct list_head		iommu_drivers_list;
	/* Serializes access to iommu_drivers_list */
	struct mutex			iommu_drivers_lock;
	/* All known groups (struct vfio_group, linked via vfio_next) */
	struct list_head		group_list;
	/* Maps an allocated minor number to its struct vfio_group */
	struct idr			group_idr;
	/* Serializes access to group_list and group_idr */
	struct mutex			group_lock;
	/* NOTE(review): presumably the char device behind the group nodes; unused in the code visible here */
	struct cdev			group_cdev;
	/* Device number whose major is used for every group node */
	dev_t				group_devt;
} vfio;
50
/* One registered IOMMU backend, as tracked on vfio.iommu_drivers_list. */
struct vfio_iommu_driver {
	/* Callback table supplied by the backend at registration time */
	const struct vfio_iommu_driver_ops	*ops;
	/* Linkage on vfio.iommu_drivers_list */
	struct list_head			vfio_next;
};
55
/* A container: a set of groups that share one IOMMU backend instance. */
struct vfio_container {
	/* Lifetime refcount; the container is kfree()d when it drops to zero */
	struct kref			kref;
	/* Groups attached to this container (linked via container_next) */
	struct list_head		group_list;
	/* Protects group_list and the iommu_driver/iommu_data pair */
	struct rw_semaphore		group_lock;
	/* Backend selected by VFIO_SET_IOMMU, NULL while unset */
	struct vfio_iommu_driver	*iommu_driver;
	/* Opaque per-container state returned by the backend's open() */
	void				*iommu_data;
	/* True when the container may only use the vfio-noiommu backend */
	bool				noiommu;
};
64
/*
 * Marks a device that has been unregistered from its vfio group but has
 * not yet finished unbinding from its driver.
 */
struct vfio_unbound_dev {
	/* The device in transition */
	struct device			*dev;
	/* Linkage on vfio_group.unbound_list */
	struct list_head		unbound_next;
};
69
/* VFIO's view of one IOMMU group. */
struct vfio_group {
	/* Lifetime refcount; released through vfio_group_release() */
	struct kref			kref;
	/* Minor number allocated from vfio.group_idr */
	int				minor;
	/* Non-zero while the group is in use through a container */
	atomic_t			container_users;
	/* Underlying IOMMU group; a reference is held for the group's lifetime */
	struct iommu_group		*iommu_group;
	/* Container the group is attached to, NULL when detached */
	struct vfio_container		*container;
	/* Registered vfio devices (linked via group_next) */
	struct list_head		device_list;
	/* Protects device_list and dev_counter */
	struct mutex			device_lock;
	/* Device node returned by device_create() */
	struct device			*dev;
	/* Notifier block registered on iommu_group */
	struct notifier_block		nb;
	/* Linkage on vfio.group_list */
	struct list_head		vfio_next;
	/* Linkage on vfio_container.group_list */
	struct list_head		container_next;
	/* Devices between unregistration and driver unbind (vfio_unbound_dev) */
	struct list_head		unbound_list;
	/* Protects unbound_list */
	struct mutex			unbound_lock;
	/* Initialized to 0; NOTE(review): presumably tracks open state, users are outside this view */
	atomic_t			opened;
	/* Woken when the group leaves its container; see vfio_unregister_group_dev() */
	wait_queue_head_t		container_q;
	/* True when the IOMMU group was created by vfio-noiommu */
	bool				noiommu;
	/* Number of entries on device_list */
	unsigned int			dev_counter;
	/* NOTE(review): not referenced in the code visible here */
	struct kvm			*kvm;
	/* Blocking notifier chain; expected to be empty at release time */
	struct blocking_notifier_head	notifier;
};
91
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Opt-in switch for no-IOMMU mode, exposed as the module parameter
 * "enable_unsafe_noiommu_mode" (world-readable, owner-writable).  The
 * address of this variable doubles as the iommudata tag that identifies
 * IOMMU groups created by vfio-noiommu.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode, noiommu, bool, 0644);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
98
99/*
100 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
101 * and remove functions, any use cases other than acquiring the first
102 * reference for the purpose of calling vfio_add_group_dev() or removing
103 * that symmetric reference after vfio_del_group_dev() should use the raw
104 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
105 * removes the device from the dummy group and cannot be nested.
106 */
/*
 * Look up the IOMMU group of @dev and return it with a reference held.
 *
 * With CONFIG_VFIO_NOIOMMU and the noiommu parameter enabled, a device
 * that has no group and sits on a bus without an IOMMU gets a freshly
 * allocated "vfio-noiommu" group, and the kernel is tainted.
 *
 * Returns the group, or NULL when none exists or could be created.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *grp = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/* A real group always wins. */
	if (grp)
		return grp;

	/* No group: only fabricate one in noiommu mode on an IOMMU-less bus. */
	if (!noiommu)
		return NULL;
	if (iommu_present(dev->bus))
		return NULL;

	grp = iommu_group_alloc();
	if (IS_ERR(grp))
		return NULL;

	/*
	 * Tag the group with &noiommu as iommudata so it can be recognized
	 * as a vfio-noiommu group later and reclaimed on put.
	 */
	iommu_group_set_name(grp, "vfio-noiommu");
	iommu_group_set_iommudata(grp, &noiommu, NULL);
	if (iommu_group_add_device(grp, dev)) {
		iommu_group_put(grp);
		return NULL;
	}

	/*
	 * The group now exists for a device with no iommu_ops behind it, so
	 * iommu_ callbacks relying on iommu_ops may Oops, and the device is
	 * about to be handed to a user without IOMMU protection.  Taint here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return grp;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
151
/*
 * Release a group reference obtained from vfio_iommu_group_get().  For a
 * vfio-noiommu group (recognized by its iommudata tag) @dev is removed
 * from the group first.
 */
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	void *tag = iommu_group_get_iommudata(group);

	if (tag == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
162
163#ifdef CONFIG_VFIO_NOIOMMU
164static void *vfio_noiommu_open(unsigned long arg)
165{
166	if (arg != VFIO_NOIOMMU_IOMMU)
167		return ERR_PTR(-EINVAL);
168	if (!capable(CAP_SYS_RAWIO))
169		return ERR_PTR(-EPERM);
170
171	return NULL;
172}
173
/* release() callback of the vfio-noiommu backend; nothing to tear down. */
static void vfio_noiommu_release(void *iommu_data)
{
}
177
178static long vfio_noiommu_ioctl(void *iommu_data,
179			       unsigned int cmd, unsigned long arg)
180{
181	if (cmd == VFIO_CHECK_EXTENSION)
182		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
183
184	return -ENOTTY;
185}
186
187static int vfio_noiommu_attach_group(void *iommu_data,
188				     struct iommu_group *iommu_group)
189{
190	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
191}
192
/* detach_group() callback of the vfio-noiommu backend; nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}
197
/*
 * Callback table of the built-in vfio-noiommu backend.  Its address is
 * also compared against driver->ops to tell this backend apart from
 * real IOMMU backends.
 */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
207#endif
208
209
210/**
211 * IOMMU driver registration
212 */
213int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
214{
215	struct vfio_iommu_driver *driver, *tmp;
216
217	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
218	if (!driver)
219		return -ENOMEM;
220
221	driver->ops = ops;
222
223	mutex_lock(&vfio.iommu_drivers_lock);
224
225	/* Check for duplicates */
226	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
227		if (tmp->ops == ops) {
228			mutex_unlock(&vfio.iommu_drivers_lock);
229			kfree(driver);
230			return -EINVAL;
231		}
232	}
233
234	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
235
236	mutex_unlock(&vfio.iommu_drivers_lock);
237
238	return 0;
239}
240EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
241
242void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
243{
244	struct vfio_iommu_driver *driver;
245
246	mutex_lock(&vfio.iommu_drivers_lock);
247	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
248		if (driver->ops == ops) {
249			list_del(&driver->vfio_next);
250			mutex_unlock(&vfio.iommu_drivers_lock);
251			kfree(driver);
252			return;
253		}
254	}
255	mutex_unlock(&vfio.iommu_drivers_lock);
256}
257EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
258
259/**
260 * Group minor allocation/free - both called with vfio.group_lock held
261 */
262static int vfio_alloc_group_minor(struct vfio_group *group)
263{
264	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
265}
266
/* Return @minor to vfio.group_idr, dropping its minor-to-group mapping. */
static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}
271
/* Forward declarations: both are used by vfio_create_group() before they are defined. */
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
275
276/**
277 * Container objects - containers are created when /dev/vfio/vfio is
278 * opened, but their lifecycle extends until the last user is done, so
279 * it's freed via kref.  Must support container/group/device being
280 * closed in any order.
281 */
/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}
286
287static void vfio_container_release(struct kref *kref)
288{
289	struct vfio_container *container;
290	container = container_of(kref, struct vfio_container, kref);
291
292	kfree(container);
293}
294
/* Drop a reference on @container; the last put frees it. */
static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
299
/*
 * Final teardown step for a group that is not (or no longer) on
 * vfio.group_list.  Must be entered with vfio.group_lock held: the lock
 * is released here, then the IOMMU group notifier is unregistered and
 * @group is freed.  @group must not be touched by the caller afterwards.
 */
static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
310
311/**
312 * Group objects - create, release, get, put, search
313 */
314static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
315{
316	struct vfio_group *group, *tmp;
317	struct device *dev;
318	int ret, minor;
319
320	group = kzalloc(sizeof(*group), GFP_KERNEL);
321	if (!group)
322		return ERR_PTR(-ENOMEM);
323
324	kref_init(&group->kref);
325	INIT_LIST_HEAD(&group->device_list);
326	mutex_init(&group->device_lock);
327	INIT_LIST_HEAD(&group->unbound_list);
328	mutex_init(&group->unbound_lock);
329	atomic_set(&group->container_users, 0);
330	atomic_set(&group->opened, 0);
331	init_waitqueue_head(&group->container_q);
332	group->iommu_group = iommu_group;
333#ifdef CONFIG_VFIO_NOIOMMU
334	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
335#endif
336	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
337
338	group->nb.notifier_call = vfio_iommu_group_notifier;
339
340	/*
341	 * blocking notifiers acquire a rwsem around registering and hold
342	 * it around callback.  Therefore, need to register outside of
343	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
344	 * do anything unless it can find the group in vfio.group_list, so
345	 * no harm in registering early.
346	 */
347	ret = iommu_group_register_notifier(iommu_group, &group->nb);
348	if (ret) {
349		kfree(group);
350		return ERR_PTR(ret);
351	}
352
353	mutex_lock(&vfio.group_lock);
354
355	/* Did we race creating this group? */
356	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
357		if (tmp->iommu_group == iommu_group) {
358			vfio_group_get(tmp);
359			vfio_group_unlock_and_free(group);
360			return tmp;
361		}
362	}
363
364	minor = vfio_alloc_group_minor(group);
365	if (minor < 0) {
366		vfio_group_unlock_and_free(group);
367		return ERR_PTR(minor);
368	}
369
370	dev = device_create(vfio.class, NULL,
371			    MKDEV(MAJOR(vfio.group_devt), minor),
372			    group, "%s%d", group->noiommu ? "noiommu-" : "",
373			    iommu_group_id(iommu_group));
374	if (IS_ERR(dev)) {
375		vfio_free_group_minor(minor);
376		vfio_group_unlock_and_free(group);
377		return ERR_CAST(dev);
378	}
379
380	group->minor = minor;
381	group->dev = dev;
382
383	list_add(&group->vfio_next, &vfio.group_list);
384
385	mutex_unlock(&vfio.group_lock);
386
387	return group;
388}
389
/*
 * kref release callback for a vfio_group.  Entered with vfio.group_lock
 * held (vfio_group_put() uses kref_put_mutex()); the lock is dropped by
 * vfio_group_unlock_and_free() before this function returns.
 */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	/* Saved up front: @group is freed before the final iommu_group_put() */
	struct iommu_group *iommu_group = group->iommu_group;

	/* No device may remain and no notifier may still be registered */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	/* Discard any leftover unbound-device markers */
	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	/* Drops vfio.group_lock, unregisters the notifier and frees @group */
	vfio_group_unlock_and_free(group);
	/* Release the iommu_group reference the vfio_group was holding */
	iommu_group_put(iommu_group);
}
412
/*
 * Drop a reference on @group.  If it is the last one, vfio.group_lock is
 * taken and vfio_group_release() runs with it held.
 */
static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
417
/* Work item carrying a deferred vfio_group_put(). */
struct vfio_group_put_work {
	/* Work queue linkage; handler is vfio_group_put_bg() */
	struct work_struct work;
	/* Group whose reference is dropped by the handler */
	struct vfio_group *group;
};
422
423static void vfio_group_put_bg(struct work_struct *work)
424{
425	struct vfio_group_put_work *do_work;
426
427	do_work = container_of(work, struct vfio_group_put_work, work);
428
429	vfio_group_put(do_work->group);
430	kfree(do_work);
431}
432
433static void vfio_group_schedule_put(struct vfio_group *group)
434{
435	struct vfio_group_put_work *do_work;
436
437	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
438	if (WARN_ON(!do_work))
439		return;
440
441	INIT_WORK(&do_work->work, vfio_group_put_bg);
442	do_work->group = group;
443	schedule_work(&do_work->work);
444}
445
/*
 * Take an additional reference on @group.  The caller must hold either
 * vfio.group_lock or a reference of its own, so that @group cannot be
 * released concurrently.
 */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
451
452/*
453 * Not really a try as we will sleep for mutex, but we need to make
454 * sure the group pointer is valid under lock and get a reference.
455 */
456static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
457{
458	struct vfio_group *target = group;
459
460	mutex_lock(&vfio.group_lock);
461	list_for_each_entry(group, &vfio.group_list, vfio_next) {
462		if (group == target) {
463			vfio_group_get(group);
464			mutex_unlock(&vfio.group_lock);
465			return group;
466		}
467	}
468	mutex_unlock(&vfio.group_lock);
469
470	return NULL;
471}
472
473static
474struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
475{
476	struct vfio_group *group;
477
478	mutex_lock(&vfio.group_lock);
479	list_for_each_entry(group, &vfio.group_list, vfio_next) {
480		if (group->iommu_group == iommu_group) {
481			vfio_group_get(group);
482			mutex_unlock(&vfio.group_lock);
483			return group;
484		}
485	}
486	mutex_unlock(&vfio.group_lock);
487
488	return NULL;
489}
490
491static struct vfio_group *vfio_group_get_from_minor(int minor)
492{
493	struct vfio_group *group;
494
495	mutex_lock(&vfio.group_lock);
496	group = idr_find(&vfio.group_idr, minor);
497	if (!group) {
498		mutex_unlock(&vfio.group_lock);
499		return NULL;
500	}
501	vfio_group_get(group);
502	mutex_unlock(&vfio.group_lock);
503
504	return group;
505}
506
507static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
508{
509	struct iommu_group *iommu_group;
510	struct vfio_group *group;
511
512	iommu_group = iommu_group_get(dev);
513	if (!iommu_group)
514		return NULL;
515
516	group = vfio_group_get_from_iommu(iommu_group);
517	iommu_group_put(iommu_group);
518
519	return group;
520}
521
522/**
523 * Device objects - create, release, get, put, search
524 */
525/* Device reference always implies a group reference */
526void vfio_device_put(struct vfio_device *device)
527{
528	if (refcount_dec_and_test(&device->refcount))
529		complete(&device->comp);
530}
531EXPORT_SYMBOL_GPL(vfio_device_put);
532
/*
 * Take a reference on @device unless its refcount has already dropped
 * to zero.  Returns true if the reference was taken.
 */
static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
537
538static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
539						 struct device *dev)
540{
541	struct vfio_device *device;
542
543	mutex_lock(&group->device_lock);
544	list_for_each_entry(device, &group->device_list, group_next) {
545		if (device->dev == dev && vfio_device_try_get(device)) {
546			mutex_unlock(&group->device_lock);
547			return device;
548		}
549	}
550	mutex_unlock(&group->device_lock);
551	return NULL;
552}
553
554/*
555 * Some drivers, like pci-stub, are only used to prevent other drivers from
556 * claiming a device and are therefore perfectly legitimate for a user owned
557 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
558 * of the device, but it does prevent the user from having direct access to
559 * the device, which is useful in some circumstances.
560 *
561 * We also assume that we can include PCI interconnect devices, ie. bridges.
562 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
563 * then all of the downstream devices will be part of the same IOMMU group as
564 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
565 * breaks anything, it only does so for user owned devices downstream.  Note
566 * that error notification via MSI can be affected for platforms that handle
567 * MSI within the same IOVA space as DMA.
568 */
/* Names of non-vfio drivers that may stay bound inside a user-owned group */
static const char * const vfio_driver_allowed[] = { "pci-stub" };
570
571static bool vfio_dev_driver_allowed(struct device *dev,
572				    struct device_driver *drv)
573{
574	if (dev_is_pci(dev)) {
575		struct pci_dev *pdev = to_pci_dev(dev);
576
577		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
578			return true;
579	}
580
581	return match_string(vfio_driver_allowed,
582			    ARRAY_SIZE(vfio_driver_allowed),
583			    drv->name) >= 0;
584}
585
586/*
587 * A vfio group is viable for use by userspace if all devices are in
588 * one of the following states:
589 *  - driver-less
590 *  - bound to a vfio driver
591 *  - bound to an otherwise allowed driver
592 *  - a PCI interconnect device
593 *
594 * We use two methods to determine whether a device is bound to a vfio
595 * driver.  The first is to test whether the device exists in the vfio
596 * group.  The second is to test if the device exists on the group
597 * unbound_list, indicating it's in the middle of transitioning from
598 * a vfio driver to driver-less.
599 */
600static int vfio_dev_viable(struct device *dev, void *data)
601{
602	struct vfio_group *group = data;
603	struct vfio_device *device;
604	struct device_driver *drv = READ_ONCE(dev->driver);
605	struct vfio_unbound_dev *unbound;
606	int ret = -EINVAL;
607
608	mutex_lock(&group->unbound_lock);
609	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
610		if (dev == unbound->dev) {
611			ret = 0;
612			break;
613		}
614	}
615	mutex_unlock(&group->unbound_lock);
616
617	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
618		return 0;
619
620	device = vfio_group_get_device(group, dev);
621	if (device) {
622		vfio_device_put(device);
623		return 0;
624	}
625
626	return ret;
627}
628
629/**
630 * Async device support
631 */
632static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
633{
634	struct vfio_device *device;
635
636	/* Do we already know about it?  We shouldn't */
637	device = vfio_group_get_device(group, dev);
638	if (WARN_ON_ONCE(device)) {
639		vfio_device_put(device);
640		return 0;
641	}
642
643	/* Nothing to do for idle groups */
644	if (!atomic_read(&group->container_users))
645		return 0;
646
647	/* TODO Prevent device auto probing */
648	dev_WARN(dev, "Device added to live group %d!\n",
649		 iommu_group_id(group->iommu_group));
650
651	return 0;
652}
653
654static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
655{
656	/* We don't care what happens when the group isn't in use */
657	if (!atomic_read(&group->container_users))
658		return 0;
659
660	return vfio_dev_viable(dev, group);
661}
662
/*
 * IOMMU group notifier callback, registered per vfio_group in
 * vfio_create_group().  @nb is the notifier block embedded in the group,
 * @action is an IOMMU_GROUP_NOTIFY_* event and @data is the affected
 * struct device.  Always returns NOTIFY_OK.
 */
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
			iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
			iommu_group_id(group->iommu_group), dev->driver->name);
		/* A non-viable binding in a live group halts the system */
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
			__func__, iommu_group_id(group->iommu_group),
			dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
			iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		/* The unbind finished: drop the device's unbound-list marker */
		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}
739
740/**
741 * VFIO driver API
742 */
743void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
744			 const struct vfio_device_ops *ops, void *device_data)
745{
746	init_completion(&device->comp);
747	device->dev = dev;
748	device->ops = ops;
749	device->device_data = device_data;
750}
751EXPORT_SYMBOL_GPL(vfio_init_group_dev);
752
753int vfio_register_group_dev(struct vfio_device *device)
754{
755	struct vfio_device *existing_device;
756	struct iommu_group *iommu_group;
757	struct vfio_group *group;
758
759	iommu_group = iommu_group_get(device->dev);
760	if (!iommu_group)
761		return -EINVAL;
762
763	group = vfio_group_get_from_iommu(iommu_group);
764	if (!group) {
765		group = vfio_create_group(iommu_group);
766		if (IS_ERR(group)) {
767			iommu_group_put(iommu_group);
768			return PTR_ERR(group);
769		}
770	} else {
771		/*
772		 * A found vfio_group already holds a reference to the
773		 * iommu_group.  A created vfio_group keeps the reference.
774		 */
775		iommu_group_put(iommu_group);
776	}
777
778	existing_device = vfio_group_get_device(group, device->dev);
779	if (existing_device) {
780		dev_WARN(device->dev, "Device already exists on group %d\n",
781			 iommu_group_id(iommu_group));
782		vfio_device_put(existing_device);
783		vfio_group_put(group);
784		return -EBUSY;
785	}
786
787	/* Our reference on group is moved to the device */
788	device->group = group;
789
790	/* Refcounting can't start until the driver calls register */
791	refcount_set(&device->refcount, 1);
792
793	mutex_lock(&group->device_lock);
794	list_add(&device->group_next, &group->device_list);
795	group->dev_counter++;
796	mutex_unlock(&group->device_lock);
797
798	return 0;
799}
800EXPORT_SYMBOL_GPL(vfio_register_group_dev);
801
802int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops,
803		       void *device_data)
804{
805	struct vfio_device *device;
806	int ret;
807
808	device = kzalloc(sizeof(*device), GFP_KERNEL);
809	if (!device)
810		return -ENOMEM;
811
812	vfio_init_group_dev(device, dev, ops, device_data);
813	ret = vfio_register_group_dev(device);
814	if (ret)
815		goto err_kfree;
816	dev_set_drvdata(dev, device);
817	return 0;
818
819err_kfree:
820	kfree(device);
821	return ret;
822}
823EXPORT_SYMBOL_GPL(vfio_add_group_dev);
824
825/**
826 * Get a reference to the vfio_device for a device.  Even if the
827 * caller thinks they own the device, they could be racing with a
828 * release call path, so we can't trust drvdata for the shortcut.
829 * Go the long way around, from the iommu_group to the vfio_group
830 * to the vfio_device.
831 */
832struct vfio_device *vfio_device_get_from_dev(struct device *dev)
833{
834	struct vfio_group *group;
835	struct vfio_device *device;
836
837	group = vfio_group_get_from_dev(dev);
838	if (!group)
839		return NULL;
840
841	device = vfio_group_get_device(group, dev);
842	vfio_group_put(group);
843
844	return device;
845}
846EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
847
848static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
849						     char *buf)
850{
851	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
852
853	mutex_lock(&group->device_lock);
854	list_for_each_entry(it, &group->device_list, group_next) {
855		int ret;
856
857		if (it->ops->match) {
858			ret = it->ops->match(it->device_data, buf);
859			if (ret < 0) {
860				device = ERR_PTR(ret);
861				break;
862			}
863		} else {
864			ret = !strcmp(dev_name(it->dev), buf);
865		}
866
867		if (ret && vfio_device_try_get(it)) {
868			device = it;
869			break;
870		}
871	}
872	mutex_unlock(&group->device_lock);
873
874	return device;
875}
876
/*
 * Return the driver-private data pointer that was supplied when the
 * device was initialized.  Caller must hold a reference to the
 * vfio_device.
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
885
/*
 * Drop the registration reference on @device and block until every other
 * reference has been released, then unlink the device from its group and
 * drop the group reference taken at registration.
 *
 * While references remain, the driver's optional ->request() callback is
 * invoked before each wait of up to ten seconds, with a counter that
 * increases on every attempt.  Waiting is interruptible until the first
 * signal arrives; after that a warning is logged and the wait continues
 * uninterruptibly.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = device->dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	/* Allocation failure is only warned about; removal proceeds anyway */
	WARN_ON(!unbound);

	/* Drop the registration reference, then see if we were the last */
	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		/* Ask the driver to get the device released */
		if (device->ops->request)
			device->ops->request(device->device_data, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				/* Signalled: warn once, then wait uninterruptibly */
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
964
965void *vfio_del_group_dev(struct device *dev)
966{
967	struct vfio_device *device = dev_get_drvdata(dev);
968	void *device_data = device->device_data;
969
970	vfio_unregister_group_dev(device);
971	dev_set_drvdata(dev, NULL);
972	kfree(device);
973	return device_data;
974}
975EXPORT_SYMBOL_GPL(vfio_del_group_dev);
976
977/**
978 * VFIO base fd, /dev/vfio/vfio
979 */
980static long vfio_ioctl_check_extension(struct vfio_container *container,
981				       unsigned long arg)
982{
983	struct vfio_iommu_driver *driver;
984	long ret = 0;
985
986	down_read(&container->group_lock);
987
988	driver = container->iommu_driver;
989
990	switch (arg) {
991		/* No base extensions yet */
992	default:
993		/*
994		 * If no driver is set, poll all registered drivers for
995		 * extensions and return the first positive result.  If
996		 * a driver is already set, further queries will be passed
997		 * only to that driver.
998		 */
999		if (!driver) {
1000			mutex_lock(&vfio.iommu_drivers_lock);
1001			list_for_each_entry(driver, &vfio.iommu_drivers_list,
1002					    vfio_next) {
1003
1004#ifdef CONFIG_VFIO_NOIOMMU
1005				if (!list_empty(&container->group_list) &&
1006				    (container->noiommu !=
1007				     (driver->ops == &vfio_noiommu_ops)))
1008					continue;
1009#endif
1010
1011				if (!try_module_get(driver->ops->owner))
1012					continue;
1013
1014				ret = driver->ops->ioctl(NULL,
1015							 VFIO_CHECK_EXTENSION,
1016							 arg);
1017				module_put(driver->ops->owner);
1018				if (ret > 0)
1019					break;
1020			}
1021			mutex_unlock(&vfio.iommu_drivers_lock);
1022		} else
1023			ret = driver->ops->ioctl(container->iommu_data,
1024						 VFIO_CHECK_EXTENSION, arg);
1025	}
1026
1027	up_read(&container->group_lock);
1028
1029	return ret;
1030}
1031
1032/* hold write lock on container->group_lock */
1033static int __vfio_container_attach_groups(struct vfio_container *container,
1034					  struct vfio_iommu_driver *driver,
1035					  void *data)
1036{
1037	struct vfio_group *group;
1038	int ret = -ENODEV;
1039
1040	list_for_each_entry(group, &container->group_list, container_next) {
1041		ret = driver->ops->attach_group(data, group->iommu_group);
1042		if (ret)
1043			goto unwind;
1044	}
1045
1046	return ret;
1047
1048unwind:
1049	list_for_each_entry_continue_reverse(group, &container->group_list,
1050					     container_next) {
1051		driver->ops->detach_group(data, group->iommu_group);
1052	}
1053
1054	return ret;
1055}
1056
1057static long vfio_ioctl_set_iommu(struct vfio_container *container,
1058				 unsigned long arg)
1059{
1060	struct vfio_iommu_driver *driver;
1061	long ret = -ENODEV;
1062
1063	down_write(&container->group_lock);
1064
1065	/*
1066	 * The container is designed to be an unprivileged interface while
1067	 * the group can be assigned to specific users.  Therefore, only by
1068	 * adding a group to a container does the user get the privilege of
1069	 * enabling the iommu, which may allocate finite resources.  There
1070	 * is no unset_iommu, but by removing all the groups from a container,
1071	 * the container is deprivileged and returns to an unset state.
1072	 */
1073	if (list_empty(&container->group_list) || container->iommu_driver) {
1074		up_write(&container->group_lock);
1075		return -EINVAL;
1076	}
1077
1078	mutex_lock(&vfio.iommu_drivers_lock);
1079	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1080		void *data;
1081
1082#ifdef CONFIG_VFIO_NOIOMMU
1083		/*
1084		 * Only noiommu containers can use vfio-noiommu and noiommu
1085		 * containers can only use vfio-noiommu.
1086		 */
1087		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1088			continue;
1089#endif
1090
1091		if (!try_module_get(driver->ops->owner))
1092			continue;
1093
1094		/*
1095		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1096		 * so test which iommu driver reported support for this
1097		 * extension and call open on them.  We also pass them the
1098		 * magic, allowing a single driver to support multiple
1099		 * interfaces if they'd like.
1100		 */
1101		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1102			module_put(driver->ops->owner);
1103			continue;
1104		}
1105
1106		data = driver->ops->open(arg);
1107		if (IS_ERR(data)) {
1108			ret = PTR_ERR(data);
1109			module_put(driver->ops->owner);
1110			continue;
1111		}
1112
1113		ret = __vfio_container_attach_groups(container, driver, data);
1114		if (ret) {
1115			driver->ops->release(data);
1116			module_put(driver->ops->owner);
1117			continue;
1118		}
1119
1120		container->iommu_driver = driver;
1121		container->iommu_data = data;
1122		break;
1123	}
1124
1125	mutex_unlock(&vfio.iommu_drivers_lock);
1126	up_write(&container->group_lock);
1127
1128	return ret;
1129}
1130
1131static long vfio_fops_unl_ioctl(struct file *filep,
1132				unsigned int cmd, unsigned long arg)
1133{
1134	struct vfio_container *container = filep->private_data;
1135	struct vfio_iommu_driver *driver;
1136	void *data;
1137	long ret = -EINVAL;
1138
1139	if (!container)
1140		return ret;
1141
1142	switch (cmd) {
1143	case VFIO_GET_API_VERSION:
1144		ret = VFIO_API_VERSION;
1145		break;
1146	case VFIO_CHECK_EXTENSION:
1147		ret = vfio_ioctl_check_extension(container, arg);
1148		break;
1149	case VFIO_SET_IOMMU:
1150		ret = vfio_ioctl_set_iommu(container, arg);
1151		break;
1152	default:
1153		driver = container->iommu_driver;
1154		data = container->iommu_data;
1155
1156		if (driver) /* passthrough all unrecognized ioctls */
1157			ret = driver->ops->ioctl(data, cmd, arg);
1158	}
1159
1160	return ret;
1161}
1162
1163static int vfio_fops_open(struct inode *inode, struct file *filep)
1164{
1165	struct vfio_container *container;
1166
1167	container = kzalloc(sizeof(*container), GFP_KERNEL);
1168	if (!container)
1169		return -ENOMEM;
1170
1171	INIT_LIST_HEAD(&container->group_list);
1172	init_rwsem(&container->group_lock);
1173	kref_init(&container->kref);
1174
1175	filep->private_data = container;
1176
1177	return 0;
1178}
1179
1180static int vfio_fops_release(struct inode *inode, struct file *filep)
1181{
1182	struct vfio_container *container = filep->private_data;
1183
1184	filep->private_data = NULL;
1185
1186	vfio_container_put(container);
1187
1188	return 0;
1189}
1190
1191/*
1192 * Once an iommu driver is set, we optionally pass read/write/mmap
1193 * on to the driver, allowing management interfaces beyond ioctl.
1194 */
1195static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1196			      size_t count, loff_t *ppos)
1197{
1198	struct vfio_container *container = filep->private_data;
1199	struct vfio_iommu_driver *driver;
1200	ssize_t ret = -EINVAL;
1201
1202	driver = container->iommu_driver;
1203	if (likely(driver && driver->ops->read))
1204		ret = driver->ops->read(container->iommu_data,
1205					buf, count, ppos);
1206
1207	return ret;
1208}
1209
1210static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1211			       size_t count, loff_t *ppos)
1212{
1213	struct vfio_container *container = filep->private_data;
1214	struct vfio_iommu_driver *driver;
1215	ssize_t ret = -EINVAL;
1216
1217	driver = container->iommu_driver;
1218	if (likely(driver && driver->ops->write))
1219		ret = driver->ops->write(container->iommu_data,
1220					 buf, count, ppos);
1221
1222	return ret;
1223}
1224
1225static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1226{
1227	struct vfio_container *container = filep->private_data;
1228	struct vfio_iommu_driver *driver;
1229	int ret = -EINVAL;
1230
1231	driver = container->iommu_driver;
1232	if (likely(driver && driver->ops->mmap))
1233		ret = driver->ops->mmap(container->iommu_data, vma);
1234
1235	return ret;
1236}
1237
1238static const struct file_operations vfio_fops = {
1239	.owner		= THIS_MODULE,
1240	.open		= vfio_fops_open,
1241	.release	= vfio_fops_release,
1242	.read		= vfio_fops_read,
1243	.write		= vfio_fops_write,
1244	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1245	.compat_ioctl	= compat_ptr_ioctl,
1246	.mmap		= vfio_fops_mmap,
1247};
1248
1249/**
1250 * VFIO Group fd, /dev/vfio/$GROUP
1251 */
1252static void __vfio_group_unset_container(struct vfio_group *group)
1253{
1254	struct vfio_container *container = group->container;
1255	struct vfio_iommu_driver *driver;
1256
1257	down_write(&container->group_lock);
1258
1259	driver = container->iommu_driver;
1260	if (driver)
1261		driver->ops->detach_group(container->iommu_data,
1262					  group->iommu_group);
1263
1264	group->container = NULL;
1265	wake_up(&group->container_q);
1266	list_del(&group->container_next);
1267
1268	/* Detaching the last group deprivileges a container, remove iommu */
1269	if (driver && list_empty(&container->group_list)) {
1270		driver->ops->release(container->iommu_data);
1271		module_put(driver->ops->owner);
1272		container->iommu_driver = NULL;
1273		container->iommu_data = NULL;
1274	}
1275
1276	up_write(&container->group_lock);
1277
1278	vfio_container_put(container);
1279}
1280
1281/*
1282 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1283 * if there was no container to unset.  Since the ioctl is called on
1284 * the group, we know that still exists, therefore the only valid
1285 * transition here is 1->0.
1286 */
1287static int vfio_group_unset_container(struct vfio_group *group)
1288{
1289	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1290
1291	if (!users)
1292		return -EINVAL;
1293	if (users != 1)
1294		return -EBUSY;
1295
1296	__vfio_group_unset_container(group);
1297
1298	return 0;
1299}
1300
1301/*
1302 * When removing container users, anything that removes the last user
1303 * implicitly removes the group from the container.  That is, if the
1304 * group file descriptor is closed, as well as any device file descriptors,
1305 * the group is free.
1306 */
1307static void vfio_group_try_dissolve_container(struct vfio_group *group)
1308{
1309	if (0 == atomic_dec_if_positive(&group->container_users))
1310		__vfio_group_unset_container(group);
1311}
1312
1313static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1314{
1315	struct fd f;
1316	struct vfio_container *container;
1317	struct vfio_iommu_driver *driver;
1318	int ret = 0;
1319
1320	if (atomic_read(&group->container_users))
1321		return -EINVAL;
1322
1323	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1324		return -EPERM;
1325
1326	f = fdget(container_fd);
1327	if (!f.file)
1328		return -EBADF;
1329
1330	/* Sanity check, is this really our fd? */
1331	if (f.file->f_op != &vfio_fops) {
1332		fdput(f);
1333		return -EINVAL;
1334	}
1335
1336	container = f.file->private_data;
1337	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1338
1339	down_write(&container->group_lock);
1340
1341	/* Real groups and fake groups cannot mix */
1342	if (!list_empty(&container->group_list) &&
1343	    container->noiommu != group->noiommu) {
1344		ret = -EPERM;
1345		goto unlock_out;
1346	}
1347
1348	driver = container->iommu_driver;
1349	if (driver) {
1350		ret = driver->ops->attach_group(container->iommu_data,
1351						group->iommu_group);
1352		if (ret)
1353			goto unlock_out;
1354	}
1355
1356	group->container = container;
1357	container->noiommu = group->noiommu;
1358	list_add(&group->container_next, &container->group_list);
1359
1360	/* Get a reference on the container and mark a user within the group */
1361	vfio_container_get(container);
1362	atomic_inc(&group->container_users);
1363
1364unlock_out:
1365	up_write(&container->group_lock);
1366	fdput(f);
1367	return ret;
1368}
1369
1370static bool vfio_group_viable(struct vfio_group *group)
1371{
1372	return (iommu_group_for_each_dev(group->iommu_group,
1373					 group, vfio_dev_viable) == 0);
1374}
1375
1376static int vfio_group_add_container_user(struct vfio_group *group)
1377{
1378	if (!atomic_inc_not_zero(&group->container_users))
1379		return -EINVAL;
1380
1381	if (group->noiommu) {
1382		atomic_dec(&group->container_users);
1383		return -EPERM;
1384	}
1385	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1386		atomic_dec(&group->container_users);
1387		return -EINVAL;
1388	}
1389
1390	return 0;
1391}
1392
1393static const struct file_operations vfio_device_fops;
1394
1395static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1396{
1397	struct vfio_device *device;
1398	struct file *filep;
1399	int ret;
1400
1401	if (0 == atomic_read(&group->container_users) ||
1402	    !group->container->iommu_driver || !vfio_group_viable(group))
1403		return -EINVAL;
1404
1405	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1406		return -EPERM;
1407
1408	device = vfio_device_get_from_name(group, buf);
1409	if (IS_ERR(device))
1410		return PTR_ERR(device);
1411
1412	ret = device->ops->open(device->device_data);
1413	if (ret) {
1414		vfio_device_put(device);
1415		return ret;
1416	}
1417
1418	/*
1419	 * We can't use anon_inode_getfd() because we need to modify
1420	 * the f_mode flags directly to allow more than just ioctls
1421	 */
1422	ret = get_unused_fd_flags(O_CLOEXEC);
1423	if (ret < 0) {
1424		device->ops->release(device->device_data);
1425		vfio_device_put(device);
1426		return ret;
1427	}
1428
1429	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1430				   device, O_RDWR);
1431	if (IS_ERR(filep)) {
1432		put_unused_fd(ret);
1433		ret = PTR_ERR(filep);
1434		device->ops->release(device->device_data);
1435		vfio_device_put(device);
1436		return ret;
1437	}
1438
1439	/*
1440	 * TODO: add an anon_inode interface to do this.
1441	 * Appears to be missing by lack of need rather than
1442	 * explicitly prevented.  Now there's need.
1443	 */
1444	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1445
1446	atomic_inc(&group->container_users);
1447
1448	fd_install(ret, filep);
1449
1450	if (group->noiommu)
1451		dev_warn(device->dev, "vfio-noiommu device opened by user "
1452			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1453
1454	return ret;
1455}
1456
1457static long vfio_group_fops_unl_ioctl(struct file *filep,
1458				      unsigned int cmd, unsigned long arg)
1459{
1460	struct vfio_group *group = filep->private_data;
1461	long ret = -ENOTTY;
1462
1463	switch (cmd) {
1464	case VFIO_GROUP_GET_STATUS:
1465	{
1466		struct vfio_group_status status;
1467		unsigned long minsz;
1468
1469		minsz = offsetofend(struct vfio_group_status, flags);
1470
1471		if (copy_from_user(&status, (void __user *)arg, minsz))
1472			return -EFAULT;
1473
1474		if (status.argsz < minsz)
1475			return -EINVAL;
1476
1477		status.flags = 0;
1478
1479		if (vfio_group_viable(group))
1480			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1481
1482		if (group->container)
1483			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1484
1485		if (copy_to_user((void __user *)arg, &status, minsz))
1486			return -EFAULT;
1487
1488		ret = 0;
1489		break;
1490	}
1491	case VFIO_GROUP_SET_CONTAINER:
1492	{
1493		int fd;
1494
1495		if (get_user(fd, (int __user *)arg))
1496			return -EFAULT;
1497
1498		if (fd < 0)
1499			return -EINVAL;
1500
1501		ret = vfio_group_set_container(group, fd);
1502		break;
1503	}
1504	case VFIO_GROUP_UNSET_CONTAINER:
1505		ret = vfio_group_unset_container(group);
1506		break;
1507	case VFIO_GROUP_GET_DEVICE_FD:
1508	{
1509		char *buf;
1510
1511		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1512		if (IS_ERR(buf))
1513			return PTR_ERR(buf);
1514
1515		ret = vfio_group_get_device_fd(group, buf);
1516		kfree(buf);
1517		break;
1518	}
1519	}
1520
1521	return ret;
1522}
1523
1524static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1525{
1526	struct vfio_group *group;
1527	int opened;
1528
1529	group = vfio_group_get_from_minor(iminor(inode));
1530	if (!group)
1531		return -ENODEV;
1532
1533	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1534		vfio_group_put(group);
1535		return -EPERM;
1536	}
1537
1538	/* Do we need multiple instances of the group open?  Seems not. */
1539	opened = atomic_cmpxchg(&group->opened, 0, 1);
1540	if (opened) {
1541		vfio_group_put(group);
1542		return -EBUSY;
1543	}
1544
1545	/* Is something still in use from a previous open? */
1546	if (group->container) {
1547		atomic_dec(&group->opened);
1548		vfio_group_put(group);
1549		return -EBUSY;
1550	}
1551
1552	/* Warn if previous user didn't cleanup and re-init to drop them */
1553	if (WARN_ON(group->notifier.head))
1554		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1555
1556	filep->private_data = group;
1557
1558	return 0;
1559}
1560
1561static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1562{
1563	struct vfio_group *group = filep->private_data;
1564
1565	filep->private_data = NULL;
1566
1567	vfio_group_try_dissolve_container(group);
1568
1569	atomic_dec(&group->opened);
1570
1571	vfio_group_put(group);
1572
1573	return 0;
1574}
1575
1576static const struct file_operations vfio_group_fops = {
1577	.owner		= THIS_MODULE,
1578	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1579	.compat_ioctl	= compat_ptr_ioctl,
1580	.open		= vfio_group_fops_open,
1581	.release	= vfio_group_fops_release,
1582};
1583
1584/**
1585 * VFIO Device fd
1586 */
1587static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1588{
1589	struct vfio_device *device = filep->private_data;
1590
1591	device->ops->release(device->device_data);
1592
1593	vfio_group_try_dissolve_container(device->group);
1594
1595	vfio_device_put(device);
1596
1597	return 0;
1598}
1599
1600static long vfio_device_fops_unl_ioctl(struct file *filep,
1601				       unsigned int cmd, unsigned long arg)
1602{
1603	struct vfio_device *device = filep->private_data;
1604
1605	if (unlikely(!device->ops->ioctl))
1606		return -EINVAL;
1607
1608	return device->ops->ioctl(device->device_data, cmd, arg);
1609}
1610
1611static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1612				     size_t count, loff_t *ppos)
1613{
1614	struct vfio_device *device = filep->private_data;
1615
1616	if (unlikely(!device->ops->read))
1617		return -EINVAL;
1618
1619	return device->ops->read(device->device_data, buf, count, ppos);
1620}
1621
1622static ssize_t vfio_device_fops_write(struct file *filep,
1623				      const char __user *buf,
1624				      size_t count, loff_t *ppos)
1625{
1626	struct vfio_device *device = filep->private_data;
1627
1628	if (unlikely(!device->ops->write))
1629		return -EINVAL;
1630
1631	return device->ops->write(device->device_data, buf, count, ppos);
1632}
1633
1634static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1635{
1636	struct vfio_device *device = filep->private_data;
1637
1638	if (unlikely(!device->ops->mmap))
1639		return -EINVAL;
1640
1641	return device->ops->mmap(device->device_data, vma);
1642}
1643
1644static const struct file_operations vfio_device_fops = {
1645	.owner		= THIS_MODULE,
1646	.release	= vfio_device_fops_release,
1647	.read		= vfio_device_fops_read,
1648	.write		= vfio_device_fops_write,
1649	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1650	.compat_ioctl	= compat_ptr_ioctl,
1651	.mmap		= vfio_device_fops_mmap,
1652};
1653
1654/**
1655 * External user API, exported by symbols to be linked dynamically.
1656 *
1657 * The protocol includes:
1658 *  1. do normal VFIO init operation:
1659 *	- opening a new container;
1660 *	- attaching group(s) to it;
1661 *	- setting an IOMMU driver for a container.
1662 * When IOMMU is set for a container, all groups in it are
1663 * considered ready to use by an external user.
1664 *
1665 * 2. User space passes a group fd to an external user.
1666 * The external user calls vfio_group_get_external_user()
1667 * to verify that:
1668 *	- the group is initialized;
1669 *	- IOMMU is set for it.
1670 * If both checks passed, vfio_group_get_external_user()
1671 * increments the container user counter to prevent
1672 * the VFIO group from disposal before KVM exits.
1673 *
1674 * 3. The external user calls vfio_external_user_iommu_id()
1675 * to know an IOMMU ID.
1676 *
1677 * 4. When the external KVM finishes, it calls
1678 * vfio_group_put_external_user() to release the VFIO group.
1679 * This call decrements the container user counter.
1680 */
1681struct vfio_group *vfio_group_get_external_user(struct file *filep)
1682{
1683	struct vfio_group *group = filep->private_data;
1684	int ret;
1685
1686	if (filep->f_op != &vfio_group_fops)
1687		return ERR_PTR(-EINVAL);
1688
1689	ret = vfio_group_add_container_user(group);
1690	if (ret)
1691		return ERR_PTR(ret);
1692
1693	vfio_group_get(group);
1694
1695	return group;
1696}
1697EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1698
1699/**
1700 * External user API, exported by symbols to be linked dynamically.
1701 * The external user passes in a device pointer
1702 * to verify that:
1703 *	- A VFIO group is assiciated with the device;
1704 *	- IOMMU is set for the group.
1705 * If both checks passed, vfio_group_get_external_user_from_dev()
1706 * increments the container user counter to prevent the VFIO group
1707 * from disposal before external user exits and returns the pointer
1708 * to the VFIO group.
1709 *
1710 * When the external user finishes using the VFIO group, it calls
1711 * vfio_group_put_external_user() to release the VFIO group and
1712 * decrement the container user counter.
1713 *
1714 * @dev [in]	: device
1715 * Return error PTR or pointer to VFIO group.
1716 */
1717
1718struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1719{
1720	struct vfio_group *group;
1721	int ret;
1722
1723	group = vfio_group_get_from_dev(dev);
1724	if (!group)
1725		return ERR_PTR(-ENODEV);
1726
1727	ret = vfio_group_add_container_user(group);
1728	if (ret) {
1729		vfio_group_put(group);
1730		return ERR_PTR(ret);
1731	}
1732
1733	return group;
1734}
1735EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1736
/*
 * Counterpart of vfio_group_get_external_user() and
 * vfio_group_get_external_user_from_dev(): drop the container user first,
 * then the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	/* May dissolve the container association if we were the last user */
	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1743
1744bool vfio_external_group_match_file(struct vfio_group *test_group,
1745				    struct file *filep)
1746{
1747	struct vfio_group *group = filep->private_data;
1748
1749	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1750}
1751EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1752
1753int vfio_external_user_iommu_id(struct vfio_group *group)
1754{
1755	return iommu_group_id(group->iommu_group);
1756}
1757EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1758
1759long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1760{
1761	return vfio_ioctl_check_extension(group->container, arg);
1762}
1763EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1764
1765/**
1766 * Sub-module support
1767 */
1768/*
1769 * Helper for managing a buffer of info chain capabilities, allocate or
1770 * reallocate a buffer with additional @size, filling in @id and @version
1771 * of the capability.  A pointer to the new capability is returned.
1772 *
1773 * NB. The chain is based at the head of the buffer, so new entries are
1774 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1775 * next offsets prior to copying to the user buffer.
1776 */
1777struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1778					       size_t size, u16 id, u16 version)
1779{
1780	void *buf;
1781	struct vfio_info_cap_header *header, *tmp;
1782
1783	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1784	if (!buf) {
1785		kfree(caps->buf);
1786		caps->buf = NULL;
1787		caps->size = 0;
1788		return ERR_PTR(-ENOMEM);
1789	}
1790
1791	caps->buf = buf;
1792	header = buf + caps->size;
1793
1794	/* Eventually copied to user buffer, zero */
1795	memset(header, 0, size);
1796
1797	header->id = id;
1798	header->version = version;
1799
1800	/* Add to the end of the capability chain */
1801	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1802		; /* nothing */
1803
1804	tmp->next = caps->size;
1805	caps->size += size;
1806
1807	return header;
1808}
1809EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1810
1811void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1812{
1813	struct vfio_info_cap_header *tmp;
1814	void *buf = (void *)caps->buf;
1815
1816	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1817		tmp->next += offset;
1818}
1819EXPORT_SYMBOL(vfio_info_cap_shift);
1820
1821int vfio_info_add_capability(struct vfio_info_cap *caps,
1822			     struct vfio_info_cap_header *cap, size_t size)
1823{
1824	struct vfio_info_cap_header *header;
1825
1826	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1827	if (IS_ERR(header))
1828		return PTR_ERR(header);
1829
1830	memcpy(header + 1, cap + 1, size - sizeof(*header));
1831
1832	return 0;
1833}
1834EXPORT_SYMBOL(vfio_info_add_capability);
1835
1836int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1837				       int max_irq_type, size_t *data_size)
1838{
1839	unsigned long minsz;
1840	size_t size;
1841
1842	minsz = offsetofend(struct vfio_irq_set, count);
1843
1844	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1845	    (hdr->count >= (U32_MAX - hdr->start)) ||
1846	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1847				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1848		return -EINVAL;
1849
1850	if (data_size)
1851		*data_size = 0;
1852
1853	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1854		return -EINVAL;
1855
1856	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1857	case VFIO_IRQ_SET_DATA_NONE:
1858		size = 0;
1859		break;
1860	case VFIO_IRQ_SET_DATA_BOOL:
1861		size = sizeof(uint8_t);
1862		break;
1863	case VFIO_IRQ_SET_DATA_EVENTFD:
1864		size = sizeof(int32_t);
1865		break;
1866	default:
1867		return -EINVAL;
1868	}
1869
1870	if (size) {
1871		if (hdr->argsz - minsz < hdr->count * size)
1872			return -EINVAL;
1873
1874		if (!data_size)
1875			return -EINVAL;
1876
1877		*data_size = hdr->count * size;
1878	}
1879
1880	return 0;
1881}
1882EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1883
1884/*
1885 * Pin a set of guest PFNs and return their associated host PFNs for local
1886 * domain only.
1887 * @dev [in]     : device
1888 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1889 * @npage [in]   : count of elements in user_pfn array.  This count should not
1890 *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1891 * @prot [in]    : protection flags
1892 * @phys_pfn[out]: array of host PFNs
1893 * Return error or number of pages pinned.
1894 */
1895int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1896		   int prot, unsigned long *phys_pfn)
1897{
1898	struct vfio_container *container;
1899	struct vfio_group *group;
1900	struct vfio_iommu_driver *driver;
1901	int ret;
1902
1903	if (!dev || !user_pfn || !phys_pfn || !npage)
1904		return -EINVAL;
1905
1906	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1907		return -E2BIG;
1908
1909	group = vfio_group_get_from_dev(dev);
1910	if (!group)
1911		return -ENODEV;
1912
1913	if (group->dev_counter > 1) {
1914		ret = -EINVAL;
1915		goto err_pin_pages;
1916	}
1917
1918	ret = vfio_group_add_container_user(group);
1919	if (ret)
1920		goto err_pin_pages;
1921
1922	container = group->container;
1923	driver = container->iommu_driver;
1924	if (likely(driver && driver->ops->pin_pages))
1925		ret = driver->ops->pin_pages(container->iommu_data,
1926					     group->iommu_group, user_pfn,
1927					     npage, prot, phys_pfn);
1928	else
1929		ret = -ENOTTY;
1930
1931	vfio_group_try_dissolve_container(group);
1932
1933err_pin_pages:
1934	vfio_group_put(group);
1935	return ret;
1936}
1937EXPORT_SYMBOL(vfio_pin_pages);
1938
1939/*
1940 * Unpin set of host PFNs for local domain only.
1941 * @dev [in]     : device
1942 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1943 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1944 * @npage [in]   : count of elements in user_pfn array.  This count should not
1945 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1946 * Return error or number of pages unpinned.
1947 */
1948int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1949{
1950	struct vfio_container *container;
1951	struct vfio_group *group;
1952	struct vfio_iommu_driver *driver;
1953	int ret;
1954
1955	if (!dev || !user_pfn || !npage)
1956		return -EINVAL;
1957
1958	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1959		return -E2BIG;
1960
1961	group = vfio_group_get_from_dev(dev);
1962	if (!group)
1963		return -ENODEV;
1964
1965	ret = vfio_group_add_container_user(group);
1966	if (ret)
1967		goto err_unpin_pages;
1968
1969	container = group->container;
1970	driver = container->iommu_driver;
1971	if (likely(driver && driver->ops->unpin_pages))
1972		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1973					       npage);
1974	else
1975		ret = -ENOTTY;
1976
1977	vfio_group_try_dissolve_container(group);
1978
1979err_unpin_pages:
1980	vfio_group_put(group);
1981	return ret;
1982}
1983EXPORT_SYMBOL(vfio_unpin_pages);
1984
1985/*
1986 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
1987 * VFIO group.
1988 *
1989 * The caller needs to call vfio_group_get_external_user() or
1990 * vfio_group_get_external_user_from_dev() prior to calling this interface,
1991 * so as to prevent the VFIO group from disposal in the middle of the call.
1992 * But it can keep the reference to the VFIO group for several calls into
1993 * this interface.
1994 * After finishing using of the VFIO group, the caller needs to release the
1995 * VFIO group by calling vfio_group_put_external_user().
1996 *
1997 * @group [in]		: VFIO group
1998 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
1999 * @npage [in]		: count of elements in user_iova_pfn array.
2000 *			  This count should not be greater
2001 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2002 * @prot [in]		: protection flags
2003 * @phys_pfn [out]	: array of host PFNs
2004 * Return error or number of pages pinned.
2005 */
2006int vfio_group_pin_pages(struct vfio_group *group,
2007			 unsigned long *user_iova_pfn, int npage,
2008			 int prot, unsigned long *phys_pfn)
2009{
2010	struct vfio_container *container;
2011	struct vfio_iommu_driver *driver;
2012	int ret;
2013
2014	if (!group || !user_iova_pfn || !phys_pfn || !npage)
2015		return -EINVAL;
2016
2017	if (group->dev_counter > 1)
2018		return -EINVAL;
2019
2020	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2021		return -E2BIG;
2022
2023	container = group->container;
2024	driver = container->iommu_driver;
2025	if (likely(driver && driver->ops->pin_pages))
2026		ret = driver->ops->pin_pages(container->iommu_data,
2027					     group->iommu_group, user_iova_pfn,
2028					     npage, prot, phys_pfn);
2029	else
2030		ret = -ENOTTY;
2031
2032	return ret;
2033}
2034EXPORT_SYMBOL(vfio_group_pin_pages);
2035
2036/*
2037 * Unpin a set of guest IOVA PFNs for a VFIO group.
2038 *
2039 * The caller needs to call vfio_group_get_external_user() or
2040 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2041 * so as to prevent the VFIO group from disposal in the middle of the call.
2042 * But it can keep the reference to the VFIO group for several calls into
2043 * this interface.
2044 * After finishing using of the VFIO group, the caller needs to release the
2045 * VFIO group by calling vfio_group_put_external_user().
2046 *
2047 * @group [in]		: vfio group
2048 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
2049 * @npage [in]		: count of elements in user_iova_pfn array.
2050 *			  This count should not be greater than
2051 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2052 * Return error or number of pages unpinned.
2053 */
2054int vfio_group_unpin_pages(struct vfio_group *group,
2055			   unsigned long *user_iova_pfn, int npage)
2056{
2057	struct vfio_container *container;
2058	struct vfio_iommu_driver *driver;
2059	int ret;
2060
2061	if (!group || !user_iova_pfn || !npage)
2062		return -EINVAL;
2063
2064	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2065		return -E2BIG;
2066
2067	container = group->container;
2068	driver = container->iommu_driver;
2069	if (likely(driver && driver->ops->unpin_pages))
2070		ret = driver->ops->unpin_pages(container->iommu_data,
2071					       user_iova_pfn, npage);
2072	else
2073		ret = -ENOTTY;
2074
2075	return ret;
2076}
2077EXPORT_SYMBOL(vfio_group_unpin_pages);
2078
2079
2080/*
2081 * This interface allows the CPUs to perform some sort of virtual DMA on
2082 * behalf of the device.
2083 *
2084 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2085 * into/from a kernel buffer.
2086 *
2087 * As the read/write of user space memory is conducted via the CPUs and is
2088 * not a real device DMA, it is not necessary to pin the user space memory.
2089 *
2090 * The caller needs to call vfio_group_get_external_user() or
2091 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2092 * so as to prevent the VFIO group from disposal in the middle of the call.
2093 * But it can keep the reference to the VFIO group for several calls into
2094 * this interface.
2095 * After finishing using of the VFIO group, the caller needs to release the
2096 * VFIO group by calling vfio_group_put_external_user().
2097 *
2098 * @group [in]		: VFIO group
2099 * @user_iova [in]	: base IOVA of a user space buffer
2100 * @data [in]		: pointer to kernel buffer
2101 * @len [in]		: kernel buffer length
2102 * @write		: indicate read or write
2103 * Return error code on failure or 0 on success.
2104 */
2105int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2106		void *data, size_t len, bool write)
2107{
2108	struct vfio_container *container;
2109	struct vfio_iommu_driver *driver;
2110	int ret = 0;
2111
2112	if (!group || !data || len <= 0)
2113		return -EINVAL;
2114
2115	container = group->container;
2116	driver = container->iommu_driver;
2117
2118	if (likely(driver && driver->ops->dma_rw))
2119		ret = driver->ops->dma_rw(container->iommu_data,
2120					  user_iova, data, len, write);
2121	else
2122		ret = -ENOTTY;
2123
2124	return ret;
2125}
2126EXPORT_SYMBOL(vfio_dma_rw);
2127
2128static int vfio_register_iommu_notifier(struct vfio_group *group,
2129					unsigned long *events,
2130					struct notifier_block *nb)
2131{
2132	struct vfio_container *container;
2133	struct vfio_iommu_driver *driver;
2134	int ret;
2135
2136	ret = vfio_group_add_container_user(group);
2137	if (ret)
2138		return -EINVAL;
2139
2140	container = group->container;
2141	driver = container->iommu_driver;
2142	if (likely(driver && driver->ops->register_notifier))
2143		ret = driver->ops->register_notifier(container->iommu_data,
2144						     events, nb);
2145	else
2146		ret = -ENOTTY;
2147
2148	vfio_group_try_dissolve_container(group);
2149
2150	return ret;
2151}
2152
2153static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2154					  struct notifier_block *nb)
2155{
2156	struct vfio_container *container;
2157	struct vfio_iommu_driver *driver;
2158	int ret;
2159
2160	ret = vfio_group_add_container_user(group);
2161	if (ret)
2162		return -EINVAL;
2163
2164	container = group->container;
2165	driver = container->iommu_driver;
2166	if (likely(driver && driver->ops->unregister_notifier))
2167		ret = driver->ops->unregister_notifier(container->iommu_data,
2168						       nb);
2169	else
2170		ret = -ENOTTY;
2171
2172	vfio_group_try_dissolve_container(group);
2173
2174	return ret;
2175}
2176
2177void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2178{
2179	group->kvm = kvm;
2180	blocking_notifier_call_chain(&group->notifier,
2181				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2182}
2183EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2184
2185static int vfio_register_group_notifier(struct vfio_group *group,
2186					unsigned long *events,
2187					struct notifier_block *nb)
2188{
2189	int ret;
2190	bool set_kvm = false;
2191
2192	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2193		set_kvm = true;
2194
2195	/* clear known events */
2196	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2197
2198	/* refuse to continue if still events remaining */
2199	if (*events)
2200		return -EINVAL;
2201
2202	ret = vfio_group_add_container_user(group);
2203	if (ret)
2204		return -EINVAL;
2205
2206	ret = blocking_notifier_chain_register(&group->notifier, nb);
2207
2208	/*
2209	 * The attaching of kvm and vfio_group might already happen, so
2210	 * here we replay once upon registration.
2211	 */
2212	if (!ret && set_kvm && group->kvm)
2213		blocking_notifier_call_chain(&group->notifier,
2214					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2215
2216	vfio_group_try_dissolve_container(group);
2217
2218	return ret;
2219}
2220
2221static int vfio_unregister_group_notifier(struct vfio_group *group,
2222					 struct notifier_block *nb)
2223{
2224	int ret;
2225
2226	ret = vfio_group_add_container_user(group);
2227	if (ret)
2228		return -EINVAL;
2229
2230	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2231
2232	vfio_group_try_dissolve_container(group);
2233
2234	return ret;
2235}
2236
2237int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2238			   unsigned long *events, struct notifier_block *nb)
2239{
2240	struct vfio_group *group;
2241	int ret;
2242
2243	if (!dev || !nb || !events || (*events == 0))
2244		return -EINVAL;
2245
2246	group = vfio_group_get_from_dev(dev);
2247	if (!group)
2248		return -ENODEV;
2249
2250	switch (type) {
2251	case VFIO_IOMMU_NOTIFY:
2252		ret = vfio_register_iommu_notifier(group, events, nb);
2253		break;
2254	case VFIO_GROUP_NOTIFY:
2255		ret = vfio_register_group_notifier(group, events, nb);
2256		break;
2257	default:
2258		ret = -EINVAL;
2259	}
2260
2261	vfio_group_put(group);
2262	return ret;
2263}
2264EXPORT_SYMBOL(vfio_register_notifier);
2265
2266int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2267			     struct notifier_block *nb)
2268{
2269	struct vfio_group *group;
2270	int ret;
2271
2272	if (!dev || !nb)
2273		return -EINVAL;
2274
2275	group = vfio_group_get_from_dev(dev);
2276	if (!group)
2277		return -ENODEV;
2278
2279	switch (type) {
2280	case VFIO_IOMMU_NOTIFY:
2281		ret = vfio_unregister_iommu_notifier(group, nb);
2282		break;
2283	case VFIO_GROUP_NOTIFY:
2284		ret = vfio_unregister_group_notifier(group, nb);
2285		break;
2286	default:
2287		ret = -EINVAL;
2288	}
2289
2290	vfio_group_put(group);
2291	return ret;
2292}
2293EXPORT_SYMBOL(vfio_unregister_notifier);
2294
/*
 * Module/class support
 */
2298static char *vfio_devnode(struct device *dev, umode_t *mode)
2299{
2300	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2301}
2302
2303static struct miscdevice vfio_dev = {
2304	.minor = VFIO_MINOR,
2305	.name = "vfio",
2306	.fops = &vfio_fops,
2307	.nodename = "vfio/vfio",
2308	.mode = S_IRUGO | S_IWUGO,
2309};
2310
2311static int __init vfio_init(void)
2312{
2313	int ret;
2314
2315	idr_init(&vfio.group_idr);
2316	mutex_init(&vfio.group_lock);
2317	mutex_init(&vfio.iommu_drivers_lock);
2318	INIT_LIST_HEAD(&vfio.group_list);
2319	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2320
2321	ret = misc_register(&vfio_dev);
2322	if (ret) {
2323		pr_err("vfio: misc device register failed\n");
2324		return ret;
2325	}
2326
2327	/* /dev/vfio/$GROUP */
2328	vfio.class = class_create(THIS_MODULE, "vfio");
2329	if (IS_ERR(vfio.class)) {
2330		ret = PTR_ERR(vfio.class);
2331		goto err_class;
2332	}
2333
2334	vfio.class->devnode = vfio_devnode;
2335
2336	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2337	if (ret)
2338		goto err_alloc_chrdev;
2339
2340	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2341	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2342	if (ret)
2343		goto err_cdev_add;
2344
2345	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2346
2347#ifdef CONFIG_VFIO_NOIOMMU
2348	vfio_register_iommu_driver(&vfio_noiommu_ops);
2349#endif
2350	return 0;
2351
2352err_cdev_add:
2353	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2354err_alloc_chrdev:
2355	class_destroy(vfio.class);
2356	vfio.class = NULL;
2357err_class:
2358	misc_deregister(&vfio_dev);
2359	return ret;
2360}
2361
2362static void __exit vfio_cleanup(void)
2363{
2364	WARN_ON(!list_empty(&vfio.group_list));
2365
2366#ifdef CONFIG_VFIO_NOIOMMU
2367	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2368#endif
2369	idr_destroy(&vfio.group_idr);
2370	cdev_del(&vfio.group_cdev);
2371	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2372	class_destroy(vfio.class);
2373	vfio.class = NULL;
2374	misc_deregister(&vfio_dev);
2375}
2376
2377module_init(vfio_init);
2378module_exit(vfio_cleanup);
2379
2380MODULE_VERSION(DRIVER_VERSION);
2381MODULE_LICENSE("GPL v2");
2382MODULE_AUTHOR(DRIVER_AUTHOR);
2383MODULE_DESCRIPTION(DRIVER_DESC);
2384MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2385MODULE_ALIAS("devname:vfio/vfio");
2386MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2387