xref: /kernel/linux/linux-6.6/drivers/vfio/pci/mlx5/main.c (revision 62306a36)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

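/*
 * Return the page backing byte @offset of the buffer's scatter-gather
 * table. Accesses are sequential, so the last SG entry and its base
 * offset are cached in @buf to keep the common-case lookup cheap.
 */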
struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

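/*
 * Grow the migration data buffer by @npages pages: allocate pages in
 * bulk, a page-sized array of pointers at a time, and append them to
 * the buffer's scatter-gather table.
 */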
int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

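/*
 * Return the queued data buffer covering stream position @pos, if any.
 * *end_of_data is set when no buffer is queued at all; a @pos outside
 * the first queued buffer moves the file into the error state.
 */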
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since this is a stream-based FD, the data is always expected to be
	 * found in the first chunk; anything else is an error.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

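/*
 * read() handler of the save FD. Unless O_NONBLOCK was given, wait for
 * data or for a terminal state, then drain the queued buffers. -ENOMSG
 * signals a temporary end of stream while in PRE_COPY.
 */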
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
			       loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

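/*
 * Queue an optional STOP_COPY_SIZE record as the first entry of the
 * pre-copy stream, advertising the expected stop_copy image size so the
 * destination can pre-allocate its receive buffer.
 */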
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

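/*
 * Pre-allocate the data and header buffers needed for the final
 * stop_copy phase, and queue the STOP_COPY_SIZE record, while the
 * device is still running.
 */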
static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
				 size_t state_size)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int ret;

	/* Be ready for a stop_copy size that might grow by 10 percent */
	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
		inc_state_size = state_size;

	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	migf->buf = buf;
	buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	migf->buf_header = buf;
	ret = mlx5vf_add_stop_copy_header(migf);
	if (ret)
		goto err_header;
	return 0;

err_header:
	mlx5vf_put_data_buffer(migf->buf_header);
	migf->buf_header = NULL;
err:
	mlx5vf_put_data_buffer(migf->buf);
	migf->buf = NULL;
	return ret;
}

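/*
 * VFIO_MIG_GET_PRECOPY_INFO handler: report the initial/dirty bytes
 * that are ready for reading and, once the current data was fully
 * consumed, kick off an incremental SAVE for the remaining dirty state.
 */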
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command while the device is suspended, so in
	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
	 * extra bytes that can't be read anyway.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns, it is guaranteed that there is no
		 * active SAVE command, so the code below is safe under the
		 * proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has
	 * additional dirty state; save a new state so it is ready for reading.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

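/*
 * Save the incremental/final state on the PRE_COPY_P2P -> STOP_COPY
 * arc, reusing the pre-allocated stop_copy buffer when it is large
 * enough for the reported state size.
 */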
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	/* Check whether we have a matching pre-allocated buffer that can fit */
	if (migf->buf && migf->buf->allocated_length >= length) {
		buf = migf->buf;
		migf->buf = NULL;
	} else {
		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

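/*
 * Create the saving migration file and issue the first (asynchronous)
 * SAVE command. @track selects PRE_COPY mode, in which the stop_copy
 * buffers are prepared up front.
 */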
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	/*
	 * Initialize everything the file's release handler relies on before
	 * any failure path that fputs the file.
	 */
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	migf->mvdev = mvdev;

	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	if (track) {
		ret = mlx5vf_prep_stop_copy(migf, length);
		if (ret)
			goto out_pd;
	}

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	/*
	 * The file's release handler frees migf, so don't fall through to
	 * the kfree() below; that would be a double free.
	 */
	fput(migf->filp);
	return ERR_PTR(ret);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

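/*
 * Copy up to one page of user data into the migration buffer at stream
 * position *pos, advancing the caller's cursors accordingly.
 */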
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

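/*
 * Load flow used when the source sent no record headers: size the data
 * buffer from the write() length and absorb everything as a raw image.
 */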
static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_LOAD_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

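/*
 * Accumulate user data into the image buffer; once @image_size bytes
 * have arrived, switch to the LOAD_IMAGE state.
 */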
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

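/*
 * Read the payload of a header record. A STOP_COPY_SIZE record supplies
 * the size hint later used to pre-allocate the image buffer; unknown
 * optional records are consumed and dropped.
 */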
static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

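/*
 * Parse a record header (struct mlx5_vf_migration_header: a 64-bit
 * little-endian record_size followed by 32-bit flags and tag fields)
 * and pick the next load state based on the tag.
 */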
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

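/*
 * write() handler of the resume FD: a state machine that parses record
 * headers, resizes the buffers as needed, absorbs the image bytes and
 * finally issues the LOAD command to the device.
 */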
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header)) {
					ret = PTR_ERR(migf->buf_header);
					migf->buf_header = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header;
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf)) {
					ret = PTR_ERR(migf->buf);
					migf->buf = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

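/*
 * Create the resuming migration file and its initial buffers. The
 * starting load state depends on whether the device supports PRE_COPY
 * (header-based stream) or not (raw image).
 */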
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	/*
	 * Initialize everything the file's release handler relies on before
	 * any failure path that fputs the file.
	 */
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	migf->mvdev = mvdev;

	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	/*
	 * The file's release handler frees migf, so don't fall through to
	 * the kfree() below; that would be a double free.
	 */
	fput(migf->filp);
	return ERR_PTR(ret);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

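/*
 * Execute a single arc of the VFIO migration state machine. Arcs that
 * expose a migration file (STOP_COPY, PRE_COPY, RESUMING) return it,
 * other arcs return NULL, and failures return an ERR_PTR.
 */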
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * The higher VFIO layers hold locks across reset and use those same
	 * locks together with the mm_lock, so we must prevent an ABBA
	 * deadlock between the state_mutex and the mm_lock.
	 * In case the state_mutex was already taken, defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");