xref: /kernel/linux/linux-6.6/drivers/vfio/pci/mlx5/cmd.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2/*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6#include "cmd.h"
7
8enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9
10static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11{
12	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13	void *query_cap = NULL, *cap;
14	int ret;
15
16	query_cap = kzalloc(query_sz, GFP_KERNEL);
17	if (!query_cap)
18		return -ENOMEM;
19
20	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21					    MLX5_CAP_GENERAL_2);
22	if (ret)
23		goto out;
24
25	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27		ret = -EOPNOTSUPP;
28out:
29	kfree(query_cap);
30	return ret;
31}
32
33static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34				  u16 *vhca_id);
35static void
36_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
/*
 * Issue SUSPEND_VHCA for this device's vhca. @op_mod selects the suspend
 * flavor. Must be called with state_mutex held; returns -ENOTCONN once the
 * PF interface has been detached, or the command's status otherwise.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the save
	 * command once it will try to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		/* An in-flight save holds save_comp until its callback ends */
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		/* Release save_comp so a subsequent save may proceed */
		complete(&migf->save_comp);

	return err;
}
71
72int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73{
74	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76
77	lockdep_assert_held(&mvdev->state_mutex);
78	if (mvdev->mdev_detach)
79		return -ENOTCONN;
80
81	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84
85	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86}
87
/*
 * Query the size of the device state that a save would produce and store it
 * in *state_size. @query_flags may carry MLX5VF_QUERY_INC (query the
 * incremental state only) and MLX5VF_QUERY_FINAL (query is for the final
 * stop-copy image). Must be called with state_mutex held.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				/* Release save_comp before bailing out */
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			/* Demote the query to a full-image one */
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	return 0;
}
144
/* Fail the dirty-page tracker and wake any waiter blocked on tracker_comp */
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
151
152static int mlx5fv_vf_event(struct notifier_block *nb,
153			   unsigned long event, void *data)
154{
155	struct mlx5vf_pci_core_device *mvdev =
156		container_of(nb, struct mlx5vf_pci_core_device, nb);
157
158	switch (event) {
159	case MLX5_PF_NOTIFY_ENABLE_VF:
160		mutex_lock(&mvdev->state_mutex);
161		mvdev->mdev_detach = false;
162		mlx5vf_state_mutex_unlock(mvdev);
163		break;
164	case MLX5_PF_NOTIFY_DISABLE_VF:
165		mlx5vf_cmd_close_migratable(mvdev);
166		mutex_lock(&mvdev->state_mutex);
167		mvdev->mdev_detach = true;
168		mlx5vf_state_mutex_unlock(mvdev);
169		break;
170	default:
171		break;
172	}
173
174	return 0;
175}
176
/*
 * Stop all migration activity on the device: fail the page tracker, close
 * the open migration files and free tracker resources. No-op when the
 * device never gained migrate_cap.
 */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
189
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV event notifier
 * and destroy the callback workqueue. No-op when migration was never
 * enabled for this device.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
199
/*
 * Probe whether the VF behind @mvdev supports live migration and, if so,
 * set up all migration state: vhca_id lookup, the ordered callback
 * workqueue, the SR-IOV event notifier and the VFIO migration (and
 * optionally dirty-log) ops. On any failure the device is simply left
 * without migrate_cap set; the function reports nothing to the caller.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	/* Migration support is only probed for VFs */
	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* Other-function queries use 1-based function ids (vf_id + 1) */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	/* Dirty-page tracking needs the adv_virtualization capability */
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

end:
	/* Drop the reference taken by mlx5_vf_get_core_dev() */
	mlx5_vf_put_core_dev(mvdev->mdev);
}
260
261static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
262				  u16 *vhca_id)
263{
264	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
265	int out_size;
266	void *out;
267	int ret;
268
269	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
270	out = kzalloc(out_size, GFP_KERNEL);
271	if (!out)
272		return -ENOMEM;
273
274	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
275	MLX5_SET(query_hca_cap_in, in, other_function, 1);
276	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
277	MLX5_SET(query_hca_cap_in, in, op_mod,
278		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
279		 HCA_CAP_OPMOD_GET_CUR);
280
281	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
282	if (ret)
283		goto err_exec;
284
285	*vhca_id = MLX5_GET(query_hca_cap_out, out,
286			    capability.cmd_hca_cap.vhca_id);
287
288err_exec:
289	kfree(out);
290	return ret;
291}
292
/*
 * Create an MTT memory key covering either a migration data buffer (@buf)
 * or the page-tracker receive buffer (@recv_buf); exactly one of the two is
 * expected to be non-NULL. On success the key is returned in *mkey.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT entries are counted in octword (pair) granularity below */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		/* Fill the MTT from the buffer's DMA-mapped scatter list */
		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		/* Receive-buffer pages carry per-page DMA addresses */
		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
344
345static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
346{
347	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
348	struct mlx5_core_dev *mdev = mvdev->mdev;
349	int ret;
350
351	lockdep_assert_held(&mvdev->state_mutex);
352	if (mvdev->mdev_detach)
353		return -ENOTCONN;
354
355	if (buf->dmaed || !buf->allocated_length)
356		return -EINVAL;
357
358	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
359	if (ret)
360		return ret;
361
362	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
363	if (ret)
364		goto err;
365
366	buf->dmaed = true;
367
368	return 0;
369err:
370	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
371	return ret;
372}
373
/*
 * Fully release a migration data buffer: destroy its MKey and DMA mapping
 * (when mapped), free every page backing it, then the buffer itself.
 * Must be called under state_mutex while the mdev is still attached.
 */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		/* The MKey references the mapping, so destroy it first */
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}
394
395struct mlx5_vhca_data_buffer *
396mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
397			 size_t length,
398			 enum dma_data_direction dma_dir)
399{
400	struct mlx5_vhca_data_buffer *buf;
401	int ret;
402
403	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
404	if (!buf)
405		return ERR_PTR(-ENOMEM);
406
407	buf->dma_dir = dma_dir;
408	buf->migf = migf;
409	if (length) {
410		ret = mlx5vf_add_migration_pages(buf,
411				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
412		if (ret)
413			goto end;
414
415		if (dma_dir != DMA_NONE) {
416			ret = mlx5vf_dma_data_buffer(buf);
417			if (ret)
418				goto end;
419		}
420	}
421
422	return buf;
423end:
424	mlx5vf_free_data_buffer(buf);
425	return ERR_PTR(ret);
426}
427
/*
 * Park a buffer on the migration file's avail_list so a later
 * mlx5vf_get_data_buffer() can recycle it instead of allocating anew.
 */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}
434
/*
 * Obtain a data buffer of at least @length bytes for @dma_dir, preferring
 * a recycled buffer from avail_list over a fresh allocation. Undersized
 * same-direction buffers encountered during the scan are collected and
 * freed at the end, outside the spinlock. Returns ERR_PTR() on failure.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				/* Big enough — reuse it as-is */
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	/* Nothing reusable was found; allocate (may return ERR_PTR) */
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}
477
/*
 * Workqueue half of the async save completion. Runs in process context
 * because the cleanup below may sleep. On command failure the data/header
 * buffers go back to the reuse list and the migration file moves to an
 * error state (PRE_COPY_ERROR for a bad-resource-state status, which the
 * save flow treats as recoverable for the final image; ERROR otherwise).
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		/* Let poll()/read() waiters observe the state change */
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	kvfree(async_data->out);
	/* Allow the next save/suspend/query to proceed */
	complete(&migf->save_comp);
	/* Drop the reference taken when the save command was issued */
	fput(migf->filp);
}
501
/*
 * Write a mlx5_vf_migration_header describing an FW data record of
 * @image_size bytes into the first page of @header_buf and queue the header
 * on the migration file's buf_list, positioned ahead of the data it
 * describes. @initial_pre_copy accounts the header into
 * pre_copy_initial_bytes. Called from the async save path, hence the
 * irqsave list locking.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	/* Copy the header into the buffer's first page */
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}
530
/*
 * Completion callback of the async SAVE_VHCA_STATE command. May run in a
 * context that cannot sleep, so on success it only publishes the produced
 * image (optionally preceded by its header record) on migf->buf_list and
 * then — on every path — defers status handling and cleanup to
 * mlx5vf_mig_file_cleanup_cb() on the ordered workqueue.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t image_size;
		unsigned long flags;
		/* Only the very first PRE_COPY chunk counts as initial bytes */
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->last_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy)
			migf->pre_copy_initial_bytes += image_size;
		migf->state = async_data->last_chunk ?
			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
		wake_up_interruptible(&migf->poll_wait);
	}

/* Note: the success path falls through here too, with status == 0 */
err:
	/*
	 * The error and the cleanup flows can't run from an
	 * interrupt context
	 */
	/* -EREMOTEIO means the device reported a status in the output */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
575
/*
 * Kick off an asynchronous SAVE_VHCA_STATE into @buf. @inc requests an
 * incremental (dirty-only) image, @track keeps tracking enabled (i.e. this
 * is not the last chunk). Holds save_comp until the async callback chain
 * (mlx5vf_save_callback -> mlx5vf_mig_file_cleanup_cb) releases it, and
 * takes a file reference dropped there as well. Caller must hold the
 * state_mutex.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Serialize against any in-flight save */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->last_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		/* Reuse the pre-allocated header buffer for the last chunk */
		if (async_data->last_chunk && migf->buf_header) {
			header_buf = migf->buf_header;
			migf->buf_header = NULL;
		} else {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->last_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_LAST;

	async_data->header_buf = header_buf;
	/* Reference dropped by the cleanup work once the save finishes */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
658
659int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
660			       struct mlx5_vf_migration_file *migf,
661			       struct mlx5_vhca_data_buffer *buf)
662{
663	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
664	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
665	int err;
666
667	lockdep_assert_held(&mvdev->state_mutex);
668	if (mvdev->mdev_detach)
669		return -ENOTCONN;
670
671	if (!buf->dmaed) {
672		err = mlx5vf_dma_data_buffer(buf);
673		if (err)
674			return err;
675	}
676
677	MLX5_SET(load_vhca_state_in, in, opcode,
678		 MLX5_CMD_OP_LOAD_VHCA_STATE);
679	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
680	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
681	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
682	MLX5_SET(load_vhca_state_in, in, size, buf->length);
683	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
684}
685
686int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
687{
688	int err;
689
690	lockdep_assert_held(&migf->mvdev->state_mutex);
691	if (migf->mvdev->mdev_detach)
692		return -ENOTCONN;
693
694	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
695	return err;
696}
697
/*
 * Release the protection domain allocated by mlx5vf_cmd_alloc_pd().
 * Silently skipped when the PF interface is already detached.
 */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}
706
/*
 * Free every resource owned by a migration file: the cached buf/buf_header
 * buffers, everything on both the in-use (buf_list) and reuse (avail_list)
 * lists, and finally the protection domain. Caller must hold the
 * state_mutex with the mdev still attached.
 */
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (migf->buf) {
		mlx5vf_free_data_buffer(migf->buf);
		migf->buf = NULL;
	}

	if (migf->buf_header) {
		mlx5vf_free_data_buffer(migf->buf_header);
		migf->buf_header = NULL;
	}

	/* Collapse both lists so a single loop below frees everything */
	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	/* PD must outlive every MKey freed above */
	mlx5vf_cmd_dealloc_pd(migf);
}
734
/*
 * Create the FW page-tracking object covering the IOVA ranges in @ranges
 * (@nnodes interval-tree nodes). If there are more ranges than the device
 * supports, they are first combined down to the device maximum. On success
 * the object id is stored in tracker->id.
 * Returns 0, -ENOMEM, -EOPNOTSUPP (address space outside the device's
 * supported window) or the command's status.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	/* Merge ranges until they fit the device limit */
	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
				 record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	/* Dirty-page reports are delivered through the FW QP */
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	/* Serialize the (possibly combined) ranges into the command */
	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* All nodes must have been consumed after combining */
	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
817
818static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
819				      u32 tracker_id)
820{
821	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
822	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
823
824	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
825	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
826	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
827
828	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
829}
830
831static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
832				     u32 tracker_id, unsigned long iova,
833				     unsigned long length, u32 tracker_state)
834{
835	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
836	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
837	void *obj_context;
838	void *cmd_hdr;
839
840	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
841	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
842	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
843	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
844
845	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
846	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
847	MLX5_SET64(page_track, obj_context, range_start_address, iova);
848	MLX5_SET64(page_track, obj_context, length, length);
849	MLX5_SET(page_track, obj_context, state, tracker_state);
850
851	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
852}
853
854static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
855			     struct mlx5_vhca_cq_buf *buf, int nent,
856			     int cqe_size)
857{
858	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
859	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
860	u8 log_wq_sz = ilog2(cqe_size);
861	int err;
862
863	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
864				       mdev->priv.numa_node);
865	if (err)
866		return err;
867
868	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
869	buf->cqe_size = cqe_size;
870	buf->nent = nent;
871	return 0;
872}
873
874static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
875{
876	struct mlx5_cqe64 *cqe64;
877	void *cqe;
878	int i;
879
880	for (i = 0; i < buf->nent; i++) {
881		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
882		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
883		cqe64->op_own = MLX5_CQE_INVALID << 4;
884	}
885}
886
/* Tear down the tracker CQ: HW object first, then its buffer and doorbell */
static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}
894
895static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
896{
897	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
898		return;
899
900	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
901				       tracker.cq.mcq));
902}
903
/*
 * EQ notifier of the page tracker: on a fatal WQ error that targets either
 * the host or the FW tracking QP, fail the tracker so its reporting loop
 * bails out. All other events are ignored.
 */
static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		/* QP number occupies the low 24 bits of qp_srq_n */
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
935
/* CQ completion handler: wake whoever waits for tracker CQEs */
static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}
945
/*
 * Create the completion queue used by the page tracker: doorbell, CQE
 * buffer (rounded up to a power-of-two number of entries), the HW CQ
 * object, and arm it for the first notification.
 * Returns 0 or a negative errno, unwinding everything on failure.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* Match CQE size to the CPU cache line (64B or 128B) */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Command length includes one PAS entry per buffer page */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Spread completion vectors by the current CPU */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Request a notification for the next CQE */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1015
/*
 * Create an RC QP for the dirty-page tracker.
 *
 * When @max_recv_wr is non-zero, a receive queue of (at least) that many
 * WQEs is allocated — this is the host-side QP that receives dirty-page
 * reports.  When @max_recv_wr is 0, a zero-length RQ is requested — this
 * is the firmware-owned peer QP.  Neither QP gets a send queue (no_sq).
 *
 * Returns the new QP on success, ERR_PTR() on failure.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		/* Ring size must be a power of two for index masking */
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* qp->buf.npages is 0 when no RQ buffer was allocated above */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	/* Tracker QPs never transmit from the host side */
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}
1101
/*
 * Post one receive WQE on @qp's RQ, pointing at the current
 * recv_buf.next_rq_offset within the receive mkey, then ring the
 * doorbell record.  Caller must ensure a free slot exists (the WARN_ON
 * catches producer overrunning consumer).
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	/* Producer may not run more than a full ring ahead of consumer */
	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
1118
1119static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1120			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1121			      bool host_qp)
1122{
1123	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1124	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1125	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1126	void *qpc;
1127	int ret;
1128
1129	/* Init */
1130	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1131	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1132	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1133	MLX5_SET(qpc, qpc, rre, 1);
1134	MLX5_SET(qpc, qpc, rwe, 1);
1135	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1136	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1137	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1138	if (ret)
1139		return ret;
1140
1141	if (host_qp) {
1142		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1143		int i;
1144
1145		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1146			mlx5vf_post_recv(qp);
1147			recv_buf->next_rq_offset += qp->max_msg_size;
1148		}
1149	}
1150
1151	/* RTR */
1152	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1153	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1154	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1155	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1156	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1157	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1158	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1159	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1160	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1161	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1162	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1163	if (ret || host_qp)
1164		return ret;
1165
1166	/* RTS */
1167	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1168	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1169	MLX5_SET(qpc, qpc, retry_count, 7);
1170	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1171	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1172	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1173	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1174
1175	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1176}
1177
/*
 * Destroy a tracker QP in firmware and release its host resources
 * (RQ fragment buffer, doorbell record, the qp struct itself).
 */
static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	/*
	 * NOTE(review): for the fw QP (created with max_recv_wr == 0) no
	 * frag buf was allocated; qp->buf is still zeroed from kzalloc and
	 * is presumed safe to pass to mlx5_frag_buf_free() — confirm.
	 */
	mlx5_frag_buf_free(mdev, &qp->buf);
	mlx5_db_free(mdev, &qp->db);
	kfree(qp);
}
1191
1192static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1193{
1194	int i;
1195
1196	/* Undo alloc_pages_bulk_array() */
1197	for (i = 0; i < recv_buf->npages; i++)
1198		__free_page(recv_buf->page_list[i]);
1199
1200	kvfree(recv_buf->page_list);
1201}
1202
1203static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1204			    unsigned int npages)
1205{
1206	unsigned int filled = 0, done = 0;
1207	int i;
1208
1209	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1210				       GFP_KERNEL_ACCOUNT);
1211	if (!recv_buf->page_list)
1212		return -ENOMEM;
1213
1214	for (;;) {
1215		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1216						npages - done,
1217						recv_buf->page_list + done);
1218		if (!filled)
1219			goto err;
1220
1221		done += filled;
1222		if (done == npages)
1223			break;
1224	}
1225
1226	recv_buf->npages = npages;
1227	return 0;
1228
1229err:
1230	for (i = 0; i < npages; i++) {
1231		if (recv_buf->page_list[i])
1232			__free_page(recv_buf->page_list[i]);
1233	}
1234
1235	kvfree(recv_buf->page_list);
1236	return -ENOMEM;
1237}
1238
1239static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1240				   struct mlx5_vhca_recv_buf *recv_buf)
1241{
1242	int i, j;
1243
1244	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1245				       sizeof(*recv_buf->dma_addrs),
1246				       GFP_KERNEL_ACCOUNT);
1247	if (!recv_buf->dma_addrs)
1248		return -ENOMEM;
1249
1250	for (i = 0; i < recv_buf->npages; i++) {
1251		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1252						      recv_buf->page_list[i],
1253						      0, PAGE_SIZE,
1254						      DMA_FROM_DEVICE);
1255		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1256			goto error;
1257	}
1258	return 0;
1259
1260error:
1261	for (j = 0; j < i; j++)
1262		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1263				 PAGE_SIZE, DMA_FROM_DEVICE);
1264
1265	kvfree(recv_buf->dma_addrs);
1266	return -ENOMEM;
1267}
1268
1269static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1270				      struct mlx5_vhca_recv_buf *recv_buf)
1271{
1272	int i;
1273
1274	for (i = 0; i < recv_buf->npages; i++)
1275		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1276				 PAGE_SIZE, DMA_FROM_DEVICE);
1277
1278	kvfree(recv_buf->dma_addrs);
1279}
1280
1281static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1282					  struct mlx5_vhca_qp *qp)
1283{
1284	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1285
1286	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1287	unregister_dma_recv_pages(mdev, recv_buf);
1288	free_recv_pages(&qp->recv_buf);
1289}
1290
1291static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1292					  struct mlx5_vhca_qp *qp, u32 pdn,
1293					  u64 rq_size)
1294{
1295	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1296	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1297	int err;
1298
1299	err = alloc_recv_pages(recv_buf, npages);
1300	if (err < 0)
1301		return err;
1302
1303	err = register_dma_recv_pages(mdev, recv_buf);
1304	if (err)
1305		goto end;
1306
1307	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1308	if (err)
1309		goto err_create_mkey;
1310
1311	return 0;
1312
1313err_create_mkey:
1314	unregister_dma_recv_pages(mdev, recv_buf);
1315end:
1316	free_recv_pages(recv_buf);
1317	return err;
1318}
1319
/*
 * Tear down everything mlx5vf_start_page_tracker() set up, in strict
 * reverse order of creation: notifier, firmware tracker object, fw QP,
 * host QP receive resources, host QP, CQ, PD, UAR.  No-op when tracking
 * is not active.  Caller holds state_mutex.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	/* Resources must not be freed while the mdev is detached */
	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
1343
1344int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1345{
1346	struct mlx5vf_pci_core_device *mvdev = container_of(
1347		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1348
1349	mutex_lock(&mvdev->state_mutex);
1350	if (!mvdev->log_active)
1351		goto end;
1352
1353	_mlx5vf_free_page_tracker_resources(mvdev);
1354	mvdev->log_active = false;
1355end:
1356	mlx5vf_state_mutex_unlock(mvdev);
1357	return 0;
1358}
1359
/*
 * vfio log_start callback: build the full dirty-page tracking pipeline
 * (UAR, PD, CQ, host RC QP with a 2MB receive ring, firmware RC QP,
 * firmware tracker object, async event notifier) and connect the two
 * QPs back-to-back.
 *
 * @ranges/@nnodes describe the IOVA ranges to track; *@page_size is the
 * caller's requested tracking granularity and is updated to the value
 * actually used after clamping to the device's min/max capability.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated so far is unwound.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	/* Only one tracker instance may be active at a time */
	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One receive WQE per max_msg_size chunk of the 2MB report ring */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested page size to the device's supported range */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* Firmware-side peer QP: no receive ring (max_recv_wr == 0) */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Connect the two QPs to each other */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	/* Report the clamped granularity back to the caller */
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1470
1471static void
1472set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1473		  struct iova_bitmap *dirty)
1474{
1475	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1476	u32 nent = size / entry_size;
1477	struct page *page;
1478	u64 addr;
1479	u64 *buf;
1480	int i;
1481
1482	if (WARN_ON(index >= qp->recv_buf.npages ||
1483		    (nent > qp->max_msg_size / entry_size)))
1484		return;
1485
1486	page = qp->recv_buf.page_list[index];
1487	buf = kmap_local_page(page);
1488	for (i = 0; i < nent; i++) {
1489		addr = MLX5_GET(page_track_report_entry, buf + i,
1490				dirty_address_low);
1491		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1492				      dirty_address_high) << 32;
1493		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1494	}
1495	kunmap_local(buf);
1496}
1497
/*
 * Handle one receive completion on the host QP: extract the tracker
 * state from the CQE immediate, decode the dirty-page report (if any)
 * into @dirty, then recycle the consumed buffer slot by reposting a
 * receive WQE for it.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	/* Top 4 bits of the immediate carry the tracker reporting state */
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Point the next receive WQE back at the slot just consumed */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
1518
/* Return a pointer to the n'th CQE within the CQ's fragmented buffer. */
static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}
1523
/*
 * Return CQE at consumer index @n if it is valid and owned by software,
 * NULL otherwise.  Ownership alternates each pass over the ring, so the
 * CQE's owner bit must match the parity of @n relative to the ring size.
 */
static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	/* With 128-byte CQEs the 64-byte CQE lives in the second half */
	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}
1538
1539static int
1540mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1541		   struct iova_bitmap *dirty, int *tracker_status)
1542{
1543	struct mlx5_cqe64 *cqe;
1544	u8 opcode;
1545
1546	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1547	if (!cqe)
1548		return CQ_EMPTY;
1549
1550	++cq->mcq.cons_index;
1551	/*
1552	 * Make sure we read CQ entry contents after we've checked the
1553	 * ownership bit.
1554	 */
1555	rmb();
1556	opcode = get_cqe_opcode(cqe);
1557	switch (opcode) {
1558	case MLX5_CQE_RESP_SEND_IMM:
1559		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1560		return CQ_OK;
1561	default:
1562		return CQ_POLL_ERR;
1563	}
1564}
1565
/*
 * vfio log_read_and_clear callback: ask the firmware tracker to report
 * dirty pages in [@iova, @iova + @length) and consume report CQEs into
 * @dirty until the tracker leaves the REPORTING state.
 *
 * When the CQ appears empty it is re-armed and polled once more before
 * sleeping on tracker_comp, to close the race where a CQE landed
 * between the empty poll and the arm.
 *
 * Returns 0 on success, -EINVAL/-ENOTCONN for bad state, -EIO on poll
 * or tracker errors.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	/* Kick the firmware into reporting mode for the requested range */
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/* Arm for an interrupt, then re-poll to close the race */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				/* Truly empty: sleep until the CQ fires */
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1625