// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
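
/*
 * Example (illustrative sketch, not part of the core): a driver picks a
 * set_id that is stable across all devices that must be handled together.
 * vfio-pci, for instance, keys the set off the PCI slot or bus so that
 * functions which can only be reset as a unit end up in one dev_set:
 *
 *	if (pci_is_root_bus(pdev->bus))
 *		ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *	else if (!pci_probe_reset_slot(pdev->slot))
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 */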

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
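
/*
 * Example (illustrative sketch, hypothetical my_driver_* name): a driver
 * performing a set-wide action, such as a bus reset, typically checks the
 * open count while holding dev_set->lock and acts before dropping it:
 *
 *	mutex_lock(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) == 1)
 *		my_driver_do_set_wide_reset(vdev);
 *	mutex_unlock(&vdev->dev_set->lock);
 */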

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered with the
 * vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Drivers may provide an @init callback to initialize device private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
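
/*
 * Example (illustrative sketch, hypothetical my_* names, my_ops assumed to be
 * a populated struct vfio_device_ops): a driver embeds struct vfio_device as
 * the first member of its private structure and uses the vfio_alloc_device()
 * wrapper, which derives @size from that structure, then registers it:
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;
 *		void __iomem *regs;
 *	};
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct my_vfio_device *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret)
 *			vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 */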

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new devices from being opened by userspace via
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in the register path, and also prevents
	 * new devices from being opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times.  The device cdev path has no secure way to support that.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 *  The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
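
/*
 * Example (illustrative sketch): a migration driver's .migration_set_state()
 * implementation typically loops over vfio_mig_get_next_state() until the
 * requested state is reached, executing one supported arc per iteration.
 * my_step_state() is a hypothetical per-arc handler that returns NULL or a
 * data file for arcs that produce one:
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *
 *		filp = my_step_state(vdev, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 *	return filp;
 */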

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}
		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
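
/*
 * Example (illustrative sketch, max_hw_ranges is a hypothetical device
 * limit): a driver whose hardware dirty tracker supports fewer ranges than
 * userspace requested can squash the interval tree it was handed in
 * .log_start() down to the supported count before programming the device:
 *
 *	if (nnodes > max_hw_ranges) {
 *		vfio_combine_iova_ranges(ranges, nnodes, max_hw_ranges);
 *		nnodes = max_hw_ranges;
 *	}
 */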

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
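
/*
 * Example (illustrative sketch): a typical *_INFO ioctl builds a capability
 * chain with vfio_info_cap_add()/vfio_info_add_capability(), then shifts the
 * chain by the size of the fixed info struct before appending it to the user
 * buffer ("info" is the ioctl's fixed header and "arg" the user pointer):
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */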

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
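
/*
 * Example (illustrative sketch): an emulated-IOMMU driver pins one page,
 * recovers the sub-page offset itself as described above, and drops the pin
 * when done:
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page) + offset_in_page(iova);
 *	... access the data at va ...
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */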

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
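
/*
 * Example (illustrative sketch, my_desc is a hypothetical structure): a
 * mediated driver can read a guest-provided descriptor through the device's
 * IOMMU mapping without pinning the backing memory:
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */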

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");