1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2/* Copyright (c) 2020 Mellanox Technologies Ltd. */
3
4#include <linux/vdpa.h>
5#include <uapi/linux/virtio_ids.h>
6#include <linux/virtio_config.h>
7#include <linux/mlx5/qp.h>
8#include <linux/mlx5/device.h>
9#include <linux/mlx5/vport.h>
10#include <linux/mlx5/fs.h>
12#include <linux/mlx5/mpfs.h>
13#include "mlx5_vnet.h"
14#include "mlx5_vdpa_ifc.h"
15#include "mlx5_vdpa.h"
16
17#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
18
19#define VALID_FEATURES_MASK                                                                        \
20	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
21	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
22	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
23	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
24	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
25	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
26	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
27	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
28	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
29	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
30	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
31	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
32	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
33
34#define VALID_STATUS_MASK                                                                          \
35	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
36	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
37
38struct mlx5_vdpa_net_resources {
39	u32 tisn;
40	u32 tdn;
41	u32 tirn;
42	u32 rqtn;
43	bool valid;
44};
45
46struct mlx5_vdpa_cq_buf {
47	struct mlx5_frag_buf_ctrl fbc;
48	struct mlx5_frag_buf frag_buf;
49	int cqe_size;
50	int nent;
51};
52
53struct mlx5_vdpa_cq {
54	struct mlx5_core_cq mcq;
55	struct mlx5_vdpa_cq_buf buf;
56	struct mlx5_db db;
57	int cqe;
58};
59
60struct mlx5_vdpa_umem {
61	struct mlx5_frag_buf_ctrl fbc;
62	struct mlx5_frag_buf frag_buf;
63	int size;
64	u32 id;
65};
66
67struct mlx5_vdpa_qp {
68	struct mlx5_core_qp mqp;
69	struct mlx5_frag_buf frag_buf;
70	struct mlx5_db db;
71	u16 head;
72	bool fw;
73};
74
75struct mlx5_vq_restore_info {
76	u32 num_ent;
77	u64 desc_addr;
78	u64 device_addr;
79	u64 driver_addr;
80	u16 avail_index;
81	u16 used_index;
82	bool ready;
83	struct vdpa_callback cb;
84	bool restore;
85};
86
87struct mlx5_vdpa_virtqueue {
88	bool ready;
89	u64 desc_addr;
90	u64 device_addr;
91	u64 driver_addr;
92	u32 num_ent;
93	struct vdpa_callback event_cb;
94
	/* Resources for implementing the notification channel from the device
	 * to the driver. fwqp is the firmware end of an RC connection; the
	 * other end is vqqp, used by the driver. cq is where completions are
	 * reported.
	 */
100	struct mlx5_vdpa_cq cq;
101	struct mlx5_vdpa_qp fwqp;
102	struct mlx5_vdpa_qp vqqp;
103
	/* umem resources are required for the virtqueue operation. Their use
	 * is internal and they must be provided by the driver.
	 */
107	struct mlx5_vdpa_umem umem1;
108	struct mlx5_vdpa_umem umem2;
109	struct mlx5_vdpa_umem umem3;
110
111	bool initialized;
112	int index;
113	u32 virtq_id;
114	struct mlx5_vdpa_net *ndev;
115	u16 avail_idx;
116	u16 used_idx;
117	int fw_state;
118
119	/* keep last in the struct */
120	struct mlx5_vq_restore_info ri;
121};
122
123/* We will remove this limitation once mlx5_vdpa_alloc_resources()
124 * provides for driver space allocation
125 */
126#define MLX5_MAX_SUPPORTED_VQS 16
127
128struct mlx5_vdpa_net {
129	struct mlx5_vdpa_dev mvdev;
130	struct mlx5_vdpa_net_resources res;
131	struct virtio_net_config config;
132	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
133
	/* Serialize vq resources creation and destruction. This is required
	 * since the memory map might change and we need to destroy and create
	 * resources while the driver is operational.
	 */
138	struct mutex reslock;
139	struct mlx5_flow_table *rxft;
140	struct mlx5_fc *rx_counter;
141	struct mlx5_flow_handle *rx_rule;
142	bool setup;
143	u16 mtu;
144};
145
146static void free_resources(struct mlx5_vdpa_net *ndev);
147static void init_mvqs(struct mlx5_vdpa_net *ndev);
148static int setup_driver(struct mlx5_vdpa_net *ndev);
149static void teardown_driver(struct mlx5_vdpa_net *ndev);
150
151static bool mlx5_vdpa_debug;
152
153#define MLX5_LOG_VIO_FLAG(_feature)                                                                \
154	do {                                                                                       \
155		if (features & BIT_ULL(_feature))                                                  \
156			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
157	} while (0)
158
159#define MLX5_LOG_VIO_STAT(_status)                                                                 \
160	do {                                                                                       \
161		if (status & (_status))                                                            \
162			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
163	} while (0)
164
165static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
166{
167	if (status & ~VALID_STATUS_MASK)
168		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
169			       status & ~VALID_STATUS_MASK);
170
171	if (!mlx5_vdpa_debug)
172		return;
173
	mlx5_vdpa_info(mvdev, "driver status %s\n", set ? "set" : "get");
175	if (set && !status) {
176		mlx5_vdpa_info(mvdev, "driver resets the device\n");
177		return;
178	}
179
180	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
181	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
182	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
183	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
184	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
185	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
186}
187
188static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
189{
190	if (features & ~VALID_FEATURES_MASK)
191		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
192			       features & ~VALID_FEATURES_MASK);
193
194	if (!mlx5_vdpa_debug)
195		return;
196
197	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
198	if (!features)
199		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
200
201	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
202	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
203	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
204	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
205	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
206	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
207	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
208	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
209	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
210	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
211	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
212	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
213	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
214	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
215	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
216	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
217	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
218	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
219	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
220	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
221	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
222	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
223	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
224	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
225	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
226	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
227	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
228	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
229	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
230	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
231	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
232	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
233	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
234	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
235}
236
237static int create_tis(struct mlx5_vdpa_net *ndev)
238{
239	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
240	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
241	void *tisc;
242	int err;
243
244	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
245	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
246	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
247	if (err)
248		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
249
250	return err;
251}
252
253static void destroy_tis(struct mlx5_vdpa_net *ndev)
254{
255	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
256}
257
258#define MLX5_VDPA_CQE_SIZE 64
259#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
260
261static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
262{
263	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
264	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
265	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
266	int err;
267
268	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
269				       ndev->mvdev.mdev->priv.numa_node);
270	if (err)
271		return err;
272
273	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
274
275	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
276	buf->nent = nent;
277
278	return 0;
279}
280
281static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
282{
283	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
284
285	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
286					ndev->mvdev.mdev->priv.numa_node);
287}
288
289static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
290{
291	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
292}
293
294static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
295{
296	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
297}
298
299static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
300{
301	struct mlx5_cqe64 *cqe64;
302	void *cqe;
303	int i;
304
305	for (i = 0; i < buf->nent; i++) {
306		cqe = get_cqe(vcq, i);
307		cqe64 = cqe;
308		cqe64->op_own = MLX5_CQE_INVALID << 4;
309	}
310}
311
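/* Return the CQE at index n if it is owned by software: the opcode must be
 * valid and the ownership bit must match the current pass over the
 * power-of-two sized CQE ring; otherwise return NULL.
 */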
312static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
313{
314	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
315
316	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
317	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
318		return cqe64;
319
320	return NULL;
321}
322
323static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
324{
325	vqp->head += n;
326	vqp->db.db[0] = cpu_to_be32(vqp->head);
327}
328
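/* Fill the create_qp_in mailbox for one of the two QPs backing a virtqueue.
 * The firmware QP only needs a zero length RQ and no SQ; the driver QP gets
 * an RC receive queue backed by the fragmented buffer and bound to the
 * virtqueue's CQ.
 */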
329static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
330		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
331{
332	struct mlx5_vdpa_qp *vqp;
333	__be64 *pas;
334	void *qpc;
335
336	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
337	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
338	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
339	if (vqp->fw) {
		/* The firmware QP is allocated by the driver on behalf of the
		 * firmware, so most of the parameters can be skipped; firmware
		 * will choose them.
		 */
344		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
345		MLX5_SET(qpc, qpc, no_sq, 1);
346		return;
347	}
348
349	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
350	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
351	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
352	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
353	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
354	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
355	MLX5_SET(qpc, qpc, no_sq, 1);
356	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
357	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
358	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
359	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
360	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
361}
362
363static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
364{
365	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
366					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
367					ndev->mvdev.mdev->priv.numa_node);
368}
369
370static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
371{
372	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
373}
374
375static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
376		     struct mlx5_vdpa_qp *vqp)
377{
378	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
379	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
380	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
381	void *qpc;
382	void *in;
383	int err;
384
385	if (!vqp->fw) {
386		vqp = &mvq->vqqp;
387		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
388		if (err)
389			return err;
390
391		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
392		if (err)
393			goto err_db;
394		inlen += vqp->frag_buf.npages * sizeof(__be64);
395	}
396
397	in = kzalloc(inlen, GFP_KERNEL);
398	if (!in) {
399		err = -ENOMEM;
400		goto err_kzalloc;
401	}
402
403	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
404	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
405	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
406	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
407	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
408	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
409	if (!vqp->fw)
410		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
411	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
412	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
413	kfree(in);
414	if (err)
415		goto err_kzalloc;
416
417	vqp->mqp.uid = ndev->mvdev.res.uid;
418	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
419
420	if (!vqp->fw)
421		rx_post(vqp, mvq->num_ent);
422
423	return 0;
424
425err_kzalloc:
426	if (!vqp->fw)
427		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
428err_db:
429	if (!vqp->fw)
430		rq_buf_free(ndev, vqp);
431
432	return err;
433}
434
435static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
436{
437	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
438
439	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
440	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
441	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
442	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
443		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
444	if (!vqp->fw) {
445		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
446		rq_buf_free(ndev, vqp);
447	}
448}
449
450static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
451{
452	return get_sw_cqe(cq, cq->mcq.cons_index);
453}
454
455static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
456{
457	struct mlx5_cqe64 *cqe64;
458
459	cqe64 = next_cqe_sw(vcq);
460	if (!cqe64)
461		return -EAGAIN;
462
463	vcq->mcq.cons_index++;
464	return 0;
465}
466
467static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
468{
469	mlx5_cq_set_ci(&mvq->cq.mcq);
470
	/* make sure the CQ consumer update is visible to the hardware before
	 * updating the RX doorbell record.
	 */
474	dma_wmb();
475	rx_post(&mvq->vqqp, num);
476	if (mvq->event_cb.callback)
477		mvq->event_cb.callback(mvq->event_cb.private);
478}
479
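/* Completion handler for the notification channel CQ: drain the CQEs,
 * periodically update the consumer index, repost to the driver RQ and invoke
 * the vdpa event callback, then re-arm the CQ.
 */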
480static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
481{
482	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
483	struct mlx5_vdpa_net *ndev = mvq->ndev;
484	void __iomem *uar_page = ndev->mvdev.res.uar->map;
485	int num = 0;
486
487	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
488		num++;
489		if (num > mvq->num_ent / 2) {
			/* If completions keep coming while we poll, we want to
			 * let the hardware know that we consumed them by
			 * updating the doorbell record. We also let the vdpa
			 * core know about this so it can relay it to the
			 * virtio driver in the guest.
			 */
496			mlx5_vdpa_handle_completions(mvq, num);
497			num = 0;
498		}
499	}
500
501	if (num)
502		mlx5_vdpa_handle_completions(mvq, num);
503
504	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
505}
506
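/* Create the CQ used by the notification channel of virtqueue idx: allocate
 * the doorbell record and the fragmented CQE buffer, mark all CQEs invalid,
 * create the CQ on EQ vector 0 and arm it.
 */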
507static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
508{
509	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
510	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
511	void __iomem *uar_page = ndev->mvdev.res.uar->map;
512	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
513	struct mlx5_vdpa_cq *vcq = &mvq->cq;
514	__be64 *pas;
515	int inlen;
516	void *cqc;
517	void *in;
518	int err;
519	int eqn;
520
521	err = mlx5_db_alloc(mdev, &vcq->db);
522	if (err)
523		return err;
524
525	vcq->mcq.set_ci_db = vcq->db.db;
526	vcq->mcq.arm_db = vcq->db.db + 1;
527	vcq->mcq.cqe_sz = 64;
528
529	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
530	if (err)
531		goto err_db;
532
533	cq_frag_buf_init(vcq, &vcq->buf);
534
535	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
536		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
537	in = kzalloc(inlen, GFP_KERNEL);
538	if (!in) {
539		err = -ENOMEM;
540		goto err_vzalloc;
541	}
542
543	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
544	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
545	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
546
547	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
548	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
549
550	/* Use vector 0 by default. Consider adding code to choose least used
551	 * vector.
552	 */
553	err = mlx5_vector2eqn(mdev, 0, &eqn);
554	if (err)
555		goto err_vec;
556
557	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
558	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
559	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
560	MLX5_SET(cqc, cqc, c_eqn, eqn);
561	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
562
563	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
564	if (err)
565		goto err_vec;
566
567	vcq->mcq.comp = mlx5_vdpa_cq_comp;
568	vcq->cqe = num_ent;
569	vcq->mcq.set_ci_db = vcq->db.db;
570	vcq->mcq.arm_db = vcq->db.db + 1;
571	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
572	kfree(in);
573	return 0;
574
575err_vec:
576	kfree(in);
577err_vzalloc:
578	cq_frag_buf_free(ndev, &vcq->buf);
579err_db:
580	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
581	return err;
582}
583
584static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
585{
586	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
587	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
588	struct mlx5_vdpa_cq *vcq = &mvq->cq;
589
590	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
591		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
592		return;
593	}
594	cq_frag_buf_free(ndev, &vcq->buf);
595	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
596}
597
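/* The required size of umem 1/2/3 is derived from the device capabilities:
 * size = umem_<num>_buffer_param_a * queue_size + umem_<num>_buffer_param_b.
 */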
598static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
599			  struct mlx5_vdpa_umem **umemp)
600{
601	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
602	int p_a;
603	int p_b;
604
605	switch (num) {
606	case 1:
607		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
608		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
609		*umemp = &mvq->umem1;
610		break;
611	case 2:
612		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
613		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
614		*umemp = &mvq->umem2;
615		break;
616	case 3:
617		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
618		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
619		*umemp = &mvq->umem3;
620		break;
621	}
622	(*umemp)->size = p_a * mvq->num_ent + p_b;
623}
624
625static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
626{
627	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
628}
629
630static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
631{
632	int inlen;
633	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
634	void *um;
635	void *in;
636	int err;
637	__be64 *pas;
638	struct mlx5_vdpa_umem *umem;
639
640	set_umem_size(ndev, mvq, num, &umem);
641	err = umem_frag_buf_alloc(ndev, umem, umem->size);
642	if (err)
643		return err;
644
645	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
646
647	in = kzalloc(inlen, GFP_KERNEL);
648	if (!in) {
649		err = -ENOMEM;
650		goto err_in;
651	}
652
653	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
654	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
655	um = MLX5_ADDR_OF(create_umem_in, in, umem);
656	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
657	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
658
659	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
660	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
661
662	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
663	if (err) {
664		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
665		goto err_cmd;
666	}
667
668	kfree(in);
669	umem->id = MLX5_GET(create_umem_out, out, umem_id);
670
671	return 0;
672
673err_cmd:
674	kfree(in);
675err_in:
676	umem_frag_buf_free(ndev, umem);
677	return err;
678}
679
680static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
681{
682	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
683	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
684	struct mlx5_vdpa_umem *umem;
685
686	switch (num) {
687	case 1:
688		umem = &mvq->umem1;
689		break;
690	case 2:
691		umem = &mvq->umem2;
692		break;
693	case 3:
694		umem = &mvq->umem3;
695		break;
696	}
697
698	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
699	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
700	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
701		return;
702
703	umem_frag_buf_free(ndev, umem);
704}
705
706static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
707{
708	int num;
709	int err;
710
711	for (num = 1; num <= 3; num++) {
712		err = create_umem(ndev, mvq, num);
713		if (err)
714			goto err_umem;
715	}
716	return 0;
717
718err_umem:
719	for (num--; num > 0; num--)
720		umem_destroy(ndev, mvq, num);
721
722	return err;
723}
724
725static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
726{
727	int num;
728
729	for (num = 3; num > 0; num--)
730		umem_destroy(ndev, mvq, num);
731}
732
733static int get_queue_type(struct mlx5_vdpa_net *ndev)
734{
735	u32 type_mask;
736
737	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
738
	/* prefer split queue */
	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;

	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));

	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
746}
747
748static bool vq_is_tx(u16 idx)
749{
750	return idx % 2;
751}
752
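/* Translate the negotiated offload features (checksum and TSO) into the
 * device's queue_feature_bit_mask_12_3 encoding used when creating the
 * virtqueue object.
 */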
753static u16 get_features_12_3(u64 features)
754{
755	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
756	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
757	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
758	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
759}
760
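/* Create the VIRTIO_NET_Q object for this virtqueue: program the descriptor,
 * used and available ring addresses, queue size, initial indices, the three
 * umems, the event QP used for notifications and, for TX queues, the TIS.
 */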
761static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
762{
763	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
764	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
765	void *obj_context;
766	void *cmd_hdr;
767	void *vq_ctx;
768	void *in;
769	int err;
770
771	err = umems_create(ndev, mvq);
772	if (err)
773		return err;
774
775	in = kzalloc(inlen, GFP_KERNEL);
776	if (!in) {
777		err = -ENOMEM;
778		goto err_alloc;
779	}
780
781	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
782
783	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
784	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
785	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
786
787	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
788	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
789	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
790	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
791		 get_features_12_3(ndev->mvdev.actual_features));
792	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
793	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
794
795	if (vq_is_tx(mvq->index))
796		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
797
798	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
799	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
800	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
801	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
802	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
803		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
804	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
805	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
806	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
807	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
808	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
809	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
810	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
811	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
812	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
813	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
814	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
815
816	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
817	if (err)
818		goto err_cmd;
819
820	kfree(in);
821	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
822
823	return 0;
824
825err_cmd:
826	kfree(in);
827err_alloc:
828	umems_destroy(ndev, mvq);
829	return err;
830}
831
832static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
833{
834	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
835	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
836
837	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
838		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
839	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
840	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
841	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
842		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
843	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
844		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
845		return;
846	}
847	umems_destroy(ndev, mvq);
848}
849
850static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
851{
852	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
853}
854
855static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
856{
857	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
858}
859
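/* Build the mailbox input/output buffers for a modify QP command. On
 * allocation failure or an unknown command, both pointers are set to NULL so
 * the caller can detect the error.
 */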
860static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
861			int *outlen, u32 qpn, u32 rqpn)
862{
863	void *qpc;
864	void *pp;
865
866	switch (cmd) {
867	case MLX5_CMD_OP_2RST_QP:
868		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
869		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
870		*in = kzalloc(*inlen, GFP_KERNEL);
871		*out = kzalloc(*outlen, GFP_KERNEL);
872		if (!*in || !*out)
873			goto outerr;
874
875		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
876		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
877		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
878		break;
879	case MLX5_CMD_OP_RST2INIT_QP:
880		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
881		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
882		*in = kzalloc(*inlen, GFP_KERNEL);
883		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
884		if (!*in || !*out)
885			goto outerr;
886
887		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
888		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
889		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
890		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
891		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
892		MLX5_SET(qpc, qpc, rwe, 1);
893		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
894		MLX5_SET(ads, pp, vhca_port_num, 1);
895		break;
896	case MLX5_CMD_OP_INIT2RTR_QP:
897		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
898		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
899		*in = kzalloc(*inlen, GFP_KERNEL);
900		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
901		if (!*in || !*out)
902			goto outerr;
903
904		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
905		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
906		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
		qpc = MLX5_ADDR_OF(init2rtr_qp_in, *in, qpc);
908		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
909		MLX5_SET(qpc, qpc, log_msg_max, 30);
910		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
911		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
912		MLX5_SET(ads, pp, fl, 1);
913		break;
914	case MLX5_CMD_OP_RTR2RTS_QP:
915		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
916		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
917		*in = kzalloc(*inlen, GFP_KERNEL);
918		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
919		if (!*in || !*out)
920			goto outerr;
921
922		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
923		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
924		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
		qpc = MLX5_ADDR_OF(rtr2rts_qp_in, *in, qpc);
926		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
927		MLX5_SET(ads, pp, ack_timeout, 14);
928		MLX5_SET(qpc, qpc, retry_count, 7);
929		MLX5_SET(qpc, qpc, rnr_retry, 7);
930		break;
931	default:
932		goto outerr_nullify;
933	}
934
935	return;
936
937outerr:
938	kfree(*in);
939	kfree(*out);
940outerr_nullify:
941	*in = NULL;
942	*out = NULL;
943}
944
945static void free_inout(void *in, void *out)
946{
947	kfree(in);
948	kfree(out);
949}
950
951/* Two QPs are used by each virtqueue. One is used by the driver and one by
952 * firmware. The fw argument indicates whether the subjected QP is the one used
953 * by firmware.
954 */
955static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
956{
957	int outlen;
958	int inlen;
959	void *out;
960	void *in;
961	int err;
962
963	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
964	if (!in || !out)
965		return -ENOMEM;
966
967	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
968	free_inout(in, out);
969	return err;
970}
971
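/* Connect the firmware and driver QPs of a virtqueue by walking both through
 * the RC state machine (RESET -> INIT -> RTR) with each QP pointing at its
 * peer, and finally moving the firmware QP to RTS so it can post
 * notifications towards the driver QP.
 */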
972static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
973{
974	int err;
975
976	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
977	if (err)
978		return err;
979
980	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
981	if (err)
982		return err;
983
984	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
985	if (err)
986		return err;
987
988	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
989	if (err)
990		return err;
991
992	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
993	if (err)
994		return err;
995
996	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
997	if (err)
998		return err;
999
1000	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1001}
1002
1003struct mlx5_virtq_attr {
1004	u8 state;
1005	u16 available_index;
1006	u16 used_index;
1007};
1008
1009static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1010			   struct mlx5_virtq_attr *attr)
1011{
1012	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1013	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1014	void *out;
1015	void *obj_context;
1016	void *cmd_hdr;
1017	int err;
1018
1019	out = kzalloc(outlen, GFP_KERNEL);
1020	if (!out)
1021		return -ENOMEM;
1022
1023	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1024
1025	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1026	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1027	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1028	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1029	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1030	if (err)
1031		goto err_cmd;
1032
1033	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1034	memset(attr, 0, sizeof(*attr));
1035	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1036	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1037	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1038	kfree(out);
1039	return 0;
1040
1041err_cmd:
1042	kfree(out);
1043	return err;
1044}
1045
1046static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1047{
1048	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1049	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1050	void *obj_context;
1051	void *cmd_hdr;
1052	void *in;
1053	int err;
1054
1055	in = kzalloc(inlen, GFP_KERNEL);
1056	if (!in)
1057		return -ENOMEM;
1058
1059	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1060
1061	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1062	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1063	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1064	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1065
1066	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1067	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1068		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1069	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1070	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1071	kfree(in);
1072	if (!err)
1073		mvq->fw_state = state;
1074
1075	return err;
1076}
1077
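/* Set up everything a single virtqueue needs: the notification channel CQ,
 * the firmware and driver QPs, the RC connection between them and the
 * virtqueue object itself, moving it to the ready state if requested.
 */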
1078static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1079{
1080	u16 idx = mvq->index;
1081	int err;
1082
1083	if (!mvq->num_ent)
1084		return 0;
1085
1086	if (mvq->initialized) {
		mlx5_vdpa_warn(&ndev->mvdev, "attempt to re-initialize\n");
1088		return -EINVAL;
1089	}
1090
1091	err = cq_create(ndev, idx, mvq->num_ent);
1092	if (err)
1093		return err;
1094
1095	err = qp_create(ndev, mvq, &mvq->fwqp);
1096	if (err)
1097		goto err_fwqp;
1098
1099	err = qp_create(ndev, mvq, &mvq->vqqp);
1100	if (err)
1101		goto err_vqqp;
1102
1103	err = connect_qps(ndev, mvq);
1104	if (err)
1105		goto err_connect;
1106
1107	err = create_virtqueue(ndev, mvq);
1108	if (err)
1109		goto err_connect;
1110
1111	if (mvq->ready) {
1112		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1113		if (err) {
1114			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1115				       idx, err);
1116			goto err_connect;
1117		}
1118	}
1119
1120	mvq->initialized = true;
1121	return 0;
1122
1123err_connect:
1124	qp_destroy(ndev, &mvq->vqqp);
1125err_vqqp:
1126	qp_destroy(ndev, &mvq->fwqp);
1127err_fwqp:
1128	cq_destroy(ndev, idx);
1129	return err;
1130}
1131
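/* Move a ready virtqueue object to the SUSPEND state and capture the
 * hardware available/used indices so they can be reported or restored later.
 */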
1132static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1133{
1134	struct mlx5_virtq_attr attr;
1135
1136	if (!mvq->initialized)
1137		return;
1138
1139	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1140		return;
1141
1142	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1143		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1144
1145	if (query_virtqueue(ndev, mvq, &attr)) {
1146		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1147		return;
1148	}
1149	mvq->avail_idx = attr.available_index;
1150	mvq->used_idx = attr.used_index;
1151}
1152
1153static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1154{
1155	int i;
1156
1157	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
1158		suspend_vq(ndev, &ndev->vqs[i]);
1159}
1160
1161static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1162{
1163	if (!mvq->initialized)
1164		return;
1165
1166	suspend_vq(ndev, mvq);
1167	destroy_virtqueue(ndev, mvq);
1168	qp_destroy(ndev, &mvq->vqqp);
1169	qp_destroy(ndev, &mvq->fwqp);
1170	cq_destroy(ndev, mvq->index);
1171	mvq->initialized = false;
1172}
1173
1174static int create_rqt(struct mlx5_vdpa_net *ndev)
1175{
1176	int log_max_rqt;
1177	__be32 *list;
1178	void *rqtc;
1179	int inlen;
1180	void *in;
1181	int i, j;
1182	int err;
1183
1184	log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1185	if (log_max_rqt < 1)
1186		return -EOPNOTSUPP;
1187
1188	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
1189	in = kzalloc(inlen, GFP_KERNEL);
1190	if (!in)
1191		return -ENOMEM;
1192
1193	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1194	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1195
1196	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1197	MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
1198	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
1199	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1200	for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
1201		if (!ndev->vqs[j].initialized)
1202			continue;
1203
1204		if (!vq_is_tx(ndev->vqs[j].index)) {
1205			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1206			i++;
1207		}
1208	}
1209
1210	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1211	kfree(in);
1212	if (err)
1213		return err;
1214
1215	return 0;
1216}
1217
1218static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1219{
1220	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1221}
1222
1223static int create_tir(struct mlx5_vdpa_net *ndev)
1224{
1225#define HASH_IP_L4PORTS                                                                            \
1226	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1227	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1228	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1229						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1230						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1231						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1232						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1233	void *rss_key;
1234	void *outer;
1235	void *tirc;
1236	void *in;
1237	int err;
1238
1239	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1240	if (!in)
1241		return -ENOMEM;
1242
1243	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1244	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1245	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1246
1247	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1248	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1249	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1250	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1251
1252	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1253	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1254	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1255	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1256
1257	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1258	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1259
1260	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1261	kfree(in);
1262	return err;
1263}
1264
1265static void destroy_tir(struct mlx5_vdpa_net *ndev)
1266{
1267	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1268}
1269
1270static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1271{
1272	struct mlx5_flow_destination dest[2] = {};
1273	struct mlx5_flow_table_attr ft_attr = {};
1274	struct mlx5_flow_act flow_act = {};
1275	struct mlx5_flow_namespace *ns;
1276	int err;
1277
1278	/* for now, one entry, match all, forward to tir */
1279	ft_attr.max_fte = 1;
1280	ft_attr.autogroup.max_num_groups = 1;
1281
1282	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1283	if (!ns) {
1284		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
1285		return -EOPNOTSUPP;
1286	}
1287
1288	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1289	if (IS_ERR(ndev->rxft))
1290		return PTR_ERR(ndev->rxft);
1291
1292	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1293	if (IS_ERR(ndev->rx_counter)) {
1294		err = PTR_ERR(ndev->rx_counter);
1295		goto err_fc;
1296	}
1297
1298	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1299	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1300	dest[0].tir_num = ndev->res.tirn;
1301	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1302	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1303	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
1304	if (IS_ERR(ndev->rx_rule)) {
1305		err = PTR_ERR(ndev->rx_rule);
1306		ndev->rx_rule = NULL;
1307		goto err_rule;
1308	}
1309
1310	return 0;
1311
1312err_rule:
1313	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1314err_fc:
1315	mlx5_destroy_flow_table(ndev->rxft);
1316	return err;
1317}
1318
1319static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1320{
1321	if (!ndev->rx_rule)
1322		return;
1323
1324	mlx5_del_flow_rules(ndev->rx_rule);
1325	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1326	mlx5_destroy_flow_table(ndev->rxft);
1327
1328	ndev->rx_rule = NULL;
1329}
1330
1331static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1332{
1333	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1334	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1335	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1336
1337	if (unlikely(!mvq->ready))
1338		return;
1339
1340	iowrite16(idx, ndev->mvdev.res.kick_addr);
1341}
1342
1343static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1344				    u64 driver_area, u64 device_area)
1345{
1346	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1347	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1348	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1349
1350	mvq->desc_addr = desc_area;
1351	mvq->device_addr = device_area;
1352	mvq->driver_addr = driver_area;
1353	return 0;
1354}
1355
1356static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1357{
1358	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1359	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1360	struct mlx5_vdpa_virtqueue *mvq;
1361
1362	mvq = &ndev->vqs[idx];
1363	mvq->num_ent = num;
1364}
1365
1366static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1367{
1368	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1369	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1370	struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
1371
1372	vq->event_cb = *cb;
1373}
1374
1375static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1376{
1377	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1378	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1379	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1380
1381	if (!ready)
1382		suspend_vq(ndev, mvq);
1383
1384	mvq->ready = ready;
1385}
1386
1387static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1388{
1389	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1390	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1391	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1392
1393	return mvq->ready;
1394}
1395
1396static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1397				  const struct vdpa_vq_state *state)
1398{
1399	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1400	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1401	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1402
1403	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1404		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1405		return -EINVAL;
1406	}
1407
1408	mvq->used_idx = state->avail_index;
1409	mvq->avail_idx = state->avail_index;
1410	return 0;
1411}
1412
1413static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1414{
1415	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1416	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1417	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1418	struct mlx5_virtq_attr attr;
1419	int err;
1420
1421	/* If the virtq object was destroyed, use the value saved at
1422	 * the last minute of suspend_vq. This caters for userspace
1423	 * that cares about emulating the index after vq is stopped.
1424	 */
1425	if (!mvq->initialized) {
1426		/* Firmware returns a wrong value for the available index.
1427		 * Since both values should be identical, we take the value of
1428		 * used_idx which is reported correctly.
1429		 */
1430		state->avail_index = mvq->used_idx;
1431		return 0;
1432	}
1433
1434	err = query_virtqueue(ndev, mvq, &attr);
1435	if (err) {
1436		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1437		return err;
1438	}
1439	state->avail_index = attr.used_index;
1440	return 0;
1441}
1442
1443static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1444{
1445	return PAGE_SIZE;
1446}
1447
enum {
	MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
};
1453
static u64 mlx_to_virtio_features(u16 dev_features)
1455{
1456	u64 result = 0;
1457
1458	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1459		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1460	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1461		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1462	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1463		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1464	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1465		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1466
1467	return result;
1468}
1469
1470static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
1471{
1472	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1473	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1474	u16 dev_features;
1475
1476	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
	ndev->mvdev.mlx_features = mlx_to_virtio_features(dev_features);
1478	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
1479		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1480	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1481	print_features(mvdev, ndev->mvdev.mlx_features, false);
1482	return ndev->mvdev.mlx_features;
1483}
1484
1485static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1486{
1487	/* Minimum features to expect */
1488	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1489		return -EOPNOTSUPP;
1490
	/* Double check the feature combination sent down by the driver.
	 * Fail invalid combinations that lack a feature they depend on.
	 *
	 * Per the VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
	 * By failing the invalid features sent down by untrusted drivers,
	 * we're assured the assumptions made by is_index_valid() and
	 * is_ctrl_vq_idx() will not be compromised.
	 */
	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
	    BIT_ULL(VIRTIO_NET_F_MQ))
1502		return -EINVAL;
1503
1504	return 0;
1505}
1506
1507static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
1508{
1509	int err;
1510	int i;
1511
1512	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
1513		err = setup_vq(ndev, &ndev->vqs[i]);
1514		if (err)
1515			goto err_vq;
1516	}
1517
1518	return 0;
1519
1520err_vq:
1521	for (--i; i >= 0; i--)
1522		teardown_vq(ndev, &ndev->vqs[i]);
1523
1524	return err;
1525}
1526
1527static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1528{
1529	struct mlx5_vdpa_virtqueue *mvq;
1530	int i;
1531
1532	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1533		mvq = &ndev->vqs[i];
1534		if (!mvq->initialized)
1535			continue;
1536
1537		teardown_vq(ndev, mvq);
1538	}
1539}
1540
1541/* TODO: cross-endian support */
1542static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
1543{
1544	return virtio_legacy_is_little_endian() ||
1545		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
1546}
1547
1548static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
1549{
1550	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
1551}
1552
1553static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
1554{
1555	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1556	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1557	int err;
1558
1559	print_features(mvdev, features, true);
1560
1561	err = verify_driver_features(mvdev, features);
1562	if (err)
1563		return err;
1564
1565	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
1566	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
1567	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
1568	return err;
1569}
1570
1571static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
1572{
1573	/* not implemented */
1574	mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n");
1575}
1576
1577#define MLX5_VDPA_MAX_VQ_ENTRIES 256
1578static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
1579{
1580	return MLX5_VDPA_MAX_VQ_ENTRIES;
1581}
1582
1583static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
1584{
1585	return VIRTIO_ID_NET;
1586}
1587
1588static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
1589{
1590	return PCI_VENDOR_ID_MELLANOX;
1591}
1592
1593static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
1594{
1595	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1596	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1597
1598	print_status(mvdev, ndev->mvdev.status, false);
1599	return ndev->mvdev.status;
1600}
1601
1602static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1603{
1604	struct mlx5_vq_restore_info *ri = &mvq->ri;
1605	struct mlx5_virtq_attr attr;
1606	int err;
1607
1608	if (!mvq->initialized)
1609		return 0;
1610
1611	err = query_virtqueue(ndev, mvq, &attr);
1612	if (err)
1613		return err;
1614
1615	ri->avail_index = attr.available_index;
1616	ri->used_index = attr.used_index;
1617	ri->ready = mvq->ready;
1618	ri->num_ent = mvq->num_ent;
1619	ri->desc_addr = mvq->desc_addr;
1620	ri->device_addr = mvq->device_addr;
1621	ri->driver_addr = mvq->driver_addr;
1622	ri->cb = mvq->event_cb;
1623	ri->restore = true;
1624	return 0;
1625}
1626
1627static int save_channels_info(struct mlx5_vdpa_net *ndev)
1628{
1629	int i;
1630
1631	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1632		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
1633		save_channel_info(ndev, &ndev->vqs[i]);
1634	}
1635	return 0;
1636}
1637
1638static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
1639{
1640	int i;
1641
1642	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1643		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1644}
1645
1646static void restore_channels_info(struct mlx5_vdpa_net *ndev)
1647{
1648	struct mlx5_vdpa_virtqueue *mvq;
1649	struct mlx5_vq_restore_info *ri;
1650	int i;
1651
1652	mlx5_clear_vqs(ndev);
1653	init_mvqs(ndev);
1654	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1655		mvq = &ndev->vqs[i];
1656		ri = &mvq->ri;
1657		if (!ri->restore)
1658			continue;
1659
1660		mvq->avail_idx = ri->avail_index;
1661		mvq->used_idx = ri->used_index;
1662		mvq->ready = ri->ready;
1663		mvq->num_ent = ri->num_ent;
1664		mvq->desc_addr = ri->desc_addr;
1665		mvq->device_addr = ri->device_addr;
1666		mvq->driver_addr = ri->driver_addr;
1667		mvq->event_cb = ri->cb;
1668	}
1669}
1670
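/* A new memory map requires re-creating the hardware objects: suspend the
 * virtqueues and save their state, tear down the driver resources, replace
 * the MR and, if the device is already DRIVER_OK, restore the saved state
 * and set up the driver again.
 */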
1671static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
1672{
1673	int err;
1674
1675	suspend_vqs(ndev);
1676	err = save_channels_info(ndev);
1677	if (err)
1678		goto err_mr;
1679
1680	teardown_driver(ndev);
1681	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1682	err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
1683	if (err)
1684		goto err_mr;
1685
1686	if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
1687		return 0;
1688
1689	restore_channels_info(ndev);
1690	err = setup_driver(ndev);
1691	if (err)
1692		goto err_setup;
1693
1694	return 0;
1695
1696err_setup:
1697	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1698err_mr:
1699	return err;
1700}
1701
1702static int setup_driver(struct mlx5_vdpa_net *ndev)
1703{
1704	int err;
1705
1706	mutex_lock(&ndev->reslock);
1707	if (ndev->setup) {
1708		mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
1709		err = 0;
1710		goto out;
1711	}
1712	err = setup_virtqueues(ndev);
1713	if (err) {
1714		mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
1715		goto out;
1716	}
1717
1718	err = create_rqt(ndev);
1719	if (err) {
1720		mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
1721		goto err_rqt;
1722	}
1723
1724	err = create_tir(ndev);
1725	if (err) {
1726		mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
1727		goto err_tir;
1728	}
1729
1730	err = add_fwd_to_tir(ndev);
1731	if (err) {
1732		mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
1733		goto err_fwd;
1734	}
1735	ndev->setup = true;
1736	mutex_unlock(&ndev->reslock);
1737
1738	return 0;
1739
1740err_fwd:
1741	destroy_tir(ndev);
1742err_tir:
1743	destroy_rqt(ndev);
1744err_rqt:
1745	teardown_virtqueues(ndev);
1746out:
1747	mutex_unlock(&ndev->reslock);
1748	return err;
1749}
1750
1751static void teardown_driver(struct mlx5_vdpa_net *ndev)
1752{
1753	mutex_lock(&ndev->reslock);
1754	if (!ndev->setup)
1755		goto out;
1756
1757	remove_fwd_to_tir(ndev);
1758	destroy_tir(ndev);
1759	destroy_rqt(ndev);
1760	teardown_virtqueues(ndev);
1761	ndev->setup = false;
1762out:
1763	mutex_unlock(&ndev->reslock);
1764}
1765
1766static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
1767{
1768	int i;
1769
1770	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1771		ndev->vqs[i].ready = false;
1772}
1773
1774static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
1775{
1776	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1777	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1778	int err;
1779
1780	print_status(mvdev, status, true);
1781	if (!status) {
1782		mlx5_vdpa_info(mvdev, "performing device reset\n");
1783		teardown_driver(ndev);
1784		clear_vqs_ready(ndev);
1785		mlx5_vdpa_destroy_mr(&ndev->mvdev);
1786		ndev->mvdev.status = 0;
1787		ndev->mvdev.mlx_features = 0;
1788		++mvdev->generation;
1789		return;
1790	}
1791
1792	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
1793		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
1794			err = setup_driver(ndev);
1795			if (err) {
1796				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
1797				goto err_setup;
1798			}
1799		} else {
1800			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
1801			return;
1802		}
1803	}
1804
1805	ndev->mvdev.status = status;
1806	return;
1807
1808err_setup:
1809	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1810	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
1811}
1812
1813static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
1814				 unsigned int len)
1815{
1816	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1817	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1818
1819	if (offset + len <= sizeof(struct virtio_net_config))
1820		memcpy(buf, (u8 *)&ndev->config + offset, len);
1821}
1822
1823static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
1824				 unsigned int len)
1825{
1826	/* not supported */
1827}
1828
1829static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
1830{
1831	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1832
1833	return mvdev->generation;
1834}
1835
1836static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
1837{
1838	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1839	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1840	bool change_map;
1841	int err;
1842
1843	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
1844	if (err) {
1845		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
1846		return err;
1847	}
1848
1849	if (change_map)
1850		return mlx5_vdpa_change_map(ndev, iotlb);
1851
1852	return 0;
1853}
1854
1855static void mlx5_vdpa_free(struct vdpa_device *vdev)
1856{
1857	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1858	struct mlx5_core_dev *pfmdev;
1859	struct mlx5_vdpa_net *ndev;
1860
1861	ndev = to_mlx5_vdpa_ndev(mvdev);
1862
1863	free_resources(ndev);
1864	if (!is_zero_ether_addr(ndev->config.mac)) {
1865		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1866		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
1867	}
1868	mlx5_vdpa_free_resources(&ndev->mvdev);
1869	mutex_destroy(&ndev->reslock);
1870}
1871
1872static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
1873{
1874	struct vdpa_notification_area ret = {};
1875
1876	return ret;
1877}
1878
1879static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
1880{
1881	return -EOPNOTSUPP;
1882}
1883
1884static const struct vdpa_config_ops mlx5_vdpa_ops = {
1885	.set_vq_address = mlx5_vdpa_set_vq_address,
1886	.set_vq_num = mlx5_vdpa_set_vq_num,
1887	.kick_vq = mlx5_vdpa_kick_vq,
1888	.set_vq_cb = mlx5_vdpa_set_vq_cb,
1889	.set_vq_ready = mlx5_vdpa_set_vq_ready,
1890	.get_vq_ready = mlx5_vdpa_get_vq_ready,
1891	.set_vq_state = mlx5_vdpa_set_vq_state,
1892	.get_vq_state = mlx5_vdpa_get_vq_state,
1893	.get_vq_notification = mlx5_get_vq_notification,
1894	.get_vq_irq = mlx5_get_vq_irq,
1895	.get_vq_align = mlx5_vdpa_get_vq_align,
1896	.get_features = mlx5_vdpa_get_features,
1897	.set_features = mlx5_vdpa_set_features,
1898	.set_config_cb = mlx5_vdpa_set_config_cb,
1899	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
1900	.get_device_id = mlx5_vdpa_get_device_id,
1901	.get_vendor_id = mlx5_vdpa_get_vendor_id,
1902	.get_status = mlx5_vdpa_get_status,
1903	.set_status = mlx5_vdpa_set_status,
1904	.get_config = mlx5_vdpa_get_config,
1905	.set_config = mlx5_vdpa_set_config,
1906	.get_generation = mlx5_vdpa_get_generation,
1907	.set_map = mlx5_vdpa_set_map,
1908	.free = mlx5_vdpa_free,
1909};
1910
1911static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
1912{
1913	u16 hw_mtu;
1914	int err;
1915
1916	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
1917	if (err)
1918		return err;
1919
1920	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
1921	return 0;
1922}
1923
1924static int alloc_resources(struct mlx5_vdpa_net *ndev)
1925{
1926	struct mlx5_vdpa_net_resources *res = &ndev->res;
1927	int err;
1928
1929	if (res->valid) {
1930		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
1931		return -EEXIST;
1932	}
1933
1934	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
1935	if (err)
1936		return err;
1937
1938	err = create_tis(ndev);
1939	if (err)
1940		goto err_tis;
1941
1942	res->valid = true;
1943
1944	return 0;
1945
1946err_tis:
1947	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1948	return err;
1949}
1950
1951static void free_resources(struct mlx5_vdpa_net *ndev)
1952{
1953	struct mlx5_vdpa_net_resources *res = &ndev->res;
1954
1955	if (!res->valid)
1956		return;
1957
1958	destroy_tis(ndev);
1959	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1960	res->valid = false;
1961}
1962
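/* Reset the per-virtqueue bookkeeping (everything up to the restore info kept
 * at the end of the struct) and mark the firmware side QP on the data
 * virtqueues that may actually be used.
 */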
1963static void init_mvqs(struct mlx5_vdpa_net *ndev)
1964{
1965	struct mlx5_vdpa_virtqueue *mvq;
1966	int i;
1967
1968	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
1969		mvq = &ndev->vqs[i];
1970		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1971		mvq->index = i;
1972		mvq->ndev = ndev;
1973		mvq->fwqp.fw = true;
1974	}
1975	for (; i < ndev->mvdev.max_vqs; i++) {
1976		mvq = &ndev->vqs[i];
1977		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1978		mvq->index = i;
1979		mvq->ndev = ndev;
1980	}
1981}
1982
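/* Probe entry point: allocate the vdpa device, query the MTU and MAC address
 * (registering the MAC in the physical function's MPFS table), allocate core
 * and net resources and register the device with the vdpa bus.
 */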
1983void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
1984{
1985	struct virtio_net_config *config;
1986	struct mlx5_core_dev *pfmdev;
1987	struct mlx5_vdpa_dev *mvdev;
1988	struct mlx5_vdpa_net *ndev;
1989	u32 max_vqs;
1990	int err;
1991
	/* Reserve one virtqueue for the control virtqueue in case we need it */
1993	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
1994	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
1995
1996	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
1997				 2 * mlx5_vdpa_max_qps(max_vqs));
1998	if (IS_ERR(ndev))
1999		return ndev;
2000
2001	ndev->mvdev.max_vqs = max_vqs;
2002	mvdev = &ndev->mvdev;
2003	mvdev->mdev = mdev;
2004	init_mvqs(ndev);
2005	mutex_init(&ndev->reslock);
2006	config = &ndev->config;
2007	err = query_mtu(mdev, &ndev->mtu);
2008	if (err)
2009		goto err_mtu;
2010
2011	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2012	if (err)
2013		goto err_mtu;
2014
2015	if (!is_zero_ether_addr(config->mac)) {
2016		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2017		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2018		if (err)
2019			goto err_mtu;
2020	}
2021
2022	mvdev->vdev.dma_dev = mdev->device;
2023	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2024	if (err)
2025		goto err_mpfs;
2026
2027	err = alloc_resources(ndev);
2028	if (err)
2029		goto err_res;
2030
2031	err = vdpa_register_device(&mvdev->vdev);
2032	if (err)
2033		goto err_reg;
2034
2035	return ndev;
2036
2037err_reg:
2038	free_resources(ndev);
2039err_res:
2040	mlx5_vdpa_free_resources(&ndev->mvdev);
2041err_mpfs:
2042	if (!is_zero_ether_addr(config->mac))
2043		mlx5_mpfs_del_mac(pfmdev, config->mac);
2044err_mtu:
2045	mutex_destroy(&ndev->reslock);
2046	put_device(&mvdev->vdev.dev);
2047	return ERR_PTR(err);
2048}
2049
2050void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev)
2051{
2052	vdpa_unregister_device(&mvdev->vdev);
2053}
2054