// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <linux/skbuff.h>
#include <net/xdp_sock_drv.h>

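/* Write the ring's producer index to its doorbell in BAR2. The doorbell
 * index comes from the queue resources returned by the device, and the
 * value is written big-endian, as the device expects.
 */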
static inline void gve_tx_put_doorbell(struct gve_priv *priv,
				       struct gve_queue_resources *q_resources,
				       u32 val)
{
	iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
}

void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid)
{
	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
	struct gve_tx_ring *tx = &priv->tx[tx_qid];

	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
}

/* gvnic can only transmit from a Registered Segment.
 * We copy skb payloads into the registered segment before writing Tx
 * descriptors and ringing the Tx doorbell.
 *
 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
 * free allocations in the order they were allocated.
 */

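/* Map the FIFO's queue page list into a contiguous kernel virtual address
 * range so that packet bytes can be memcpy'd into the registered segment.
 */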
static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
			  PAGE_KERNEL);
	if (unlikely(!fifo->base)) {
		netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
			  fifo->qpl->id);
		return -ENOMEM;
	}

	fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
	atomic_set(&fifo->available, fifo->size);
	fifo->head = 0;
	return 0;
}

static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	WARN(atomic_read(&fifo->available) != fifo->size,
	     "Releasing non-empty fifo");

	vunmap(fifo->base);
}

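/* Returns the number of padding bytes needed to push an allocation of
 * @bytes to the start of the FIFO, i.e. the bytes left at the tail when
 * the allocation would otherwise straddle the wrap point, or 0 if it fits
 * before the end of the FIFO.
 */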
static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo,
					  size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_read(&fifo->available) <= bytes) ? false : true;
}

/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
 * @fifo: FIFO to allocate from
 * @bytes: Allocation size
 * @iov: Scatter-gather elements to fill with allocation fragment base/len
 *
 * Returns number of valid elements in iov[] or negative on error.
 *
 * Allocations from a given FIFO must be externally synchronized but concurrent
 * allocation and frees are allowed.
 */
static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
			     struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	u32 aligned_head;
	int nfrags = 0;

	if (!bytes)
		return 0;

	/* This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
	     "Reached %s when there's not enough space in the fifo", __func__);

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/* If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = L1_CACHE_ALIGN(fifo->head);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_sub(bytes + padding, &fifo->available);
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return nfrags;
}

/* gve_tx_free_fifo - Return space to Tx FIFO
 * @fifo: FIFO to return fragments to
 * @bytes: Bytes to free
 */
static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add(bytes, &fifo->available);
}

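/* Reset the iovecs recorded for a completed buffer and return the number
 * of FIFO bytes (data plus alignment padding) that can be handed back.
 */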
static size_t gve_tx_clear_buffer_state(struct gve_tx_buffer_state *info)
{
	size_t space_freed = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
		space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
		info->iov[i].iov_len = 0;
		info->iov[i].iov_padding = 0;
	}
	return space_freed;
}

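/* Reclaim up to @to_do completed descriptors on an XDP TX queue: return
 * frames from ndo_xdp_xmit, count XSK completions, free FIFO space and
 * update stats. Returns the number of packets cleaned.
 */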
static int gve_clean_xdp_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			      u32 to_do)
{
	struct gve_tx_buffer_state *info;
	u32 clean_end = tx->done + to_do;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	u32 xsk_complete = 0;
	u32 idx;

	for (; tx->done < clean_end; tx->done++) {
		idx = tx->done & tx->mask;
		info = &tx->info[idx];

		if (unlikely(!info->xdp.size))
			continue;

		bytes += info->xdp.size;
		pkts++;
		xsk_complete += info->xdp.is_xsk;

		info->xdp.size = 0;
		if (info->xdp_frame) {
			xdp_return_frame(info->xdp_frame);
			info->xdp_frame = NULL;
		}
		space_freed += gve_tx_clear_buffer_state(info);
	}

	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	if (xsk_complete > 0 && tx->xsk_pool)
		xsk_tx_completed(tx->xsk_pool, xsk_complete);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	return pkts;
}

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake);

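/* Tear down a TX ring: drain outstanding completions, release the FIFO and
 * its QPL for copy-mode queues, and free the descriptor ring, queue
 * resources and per-slot metadata.
 */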
static void gve_tx_free_ring(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;
	u32 slots;

	gve_tx_remove_from_block(priv, idx);
	slots = tx->mask + 1;
	if (tx->q_num < priv->tx_cfg.num_queues) {
		gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
		netdev_tx_reset_queue(tx->netdev_txq);
	} else {
		gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt);
	}

	dma_free_coherent(hdev, sizeof(*tx->q_resources),
			  tx->q_resources, tx->q_resources_bus);
	tx->q_resources = NULL;

	if (!tx->raw_addressing) {
		gve_tx_fifo_release(priv, &tx->tx_fifo);
		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
		tx->tx_fifo.qpl = NULL;
	}

	bytes = sizeof(*tx->desc) * slots;
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;

	vfree(tx->info);
	tx->info = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

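/* Set up a TX ring: allocate the per-slot metadata and descriptor ring,
 * and, for copy-mode (QPL) queues, assign and vmap a queue page list to
 * back the TX FIFO before allocating the queue resources.
 */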
static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	u32 slots = priv->tx_desc_cnt;
	size_t bytes;

	/* Make sure everything is zeroed to start */
	memset(tx, 0, sizeof(*tx));
	spin_lock_init(&tx->clean_lock);
	spin_lock_init(&tx->xdp_lock);
	tx->q_num = idx;

	tx->mask = slots - 1;

	/* alloc metadata */
	tx->info = vcalloc(slots, sizeof(*tx->info));
	if (!tx->info)
		return -ENOMEM;

	/* alloc tx queue */
	bytes = sizeof(*tx->desc) * slots;
	tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->desc)
		goto abort_with_info;

	tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;
	tx->dev = &priv->pdev->dev;
	if (!tx->raw_addressing) {
		tx->tx_fifo.qpl = gve_assign_tx_qpl(priv, idx);
		if (!tx->tx_fifo.qpl)
			goto abort_with_desc;
		/* map Tx FIFO */
		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
			goto abort_with_qpl;
	}

	tx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*tx->q_resources),
				   &tx->q_resources_bus,
				   GFP_KERNEL);
	if (!tx->q_resources)
		goto abort_with_fifo;

	netif_dbg(priv, drv, priv->dev, "tx[%d]->bus=%lx\n", idx,
		  (unsigned long)tx->bus);
	if (idx < priv->tx_cfg.num_queues)
		tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	gve_tx_add_to_block(priv, idx);

	return 0;

abort_with_fifo:
	if (!tx->raw_addressing)
		gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_qpl:
	if (!tx->raw_addressing)
		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
abort_with_desc:
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;
abort_with_info:
	vfree(tx->info);
	tx->info = NULL;
	return -ENOMEM;
}

int gve_tx_alloc_rings(struct gve_priv *priv, int start_id, int num_rings)
{
	int err = 0;
	int i;

	for (i = start_id; i < start_id + num_rings; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			break;
		}
	}
	/* Free any rings that were allocated before the error */
	if (err) {
		int j;

		for (j = start_id; j < i; j++)
			gve_tx_free_ring(priv, j);
	}
	return err;
}

void gve_tx_free_rings_gqi(struct gve_priv *priv, int start_id, int num_rings)
{
	int i;

	for (i = start_id; i < start_id + num_rings; i++)
		gve_tx_free_ring(priv, i);
}

/* gve_tx_avail - Calculates the number of slots available in the ring
 * @tx: tx ring to check
 *
 * Returns the number of slots available
 *
 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
 **/
static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
{
	return tx->mask + 1 - (tx->req - tx->done);
}

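/* Worst-case FIFO bytes needed for @skb: the full payload plus any padding
 * required to keep the header from wrapping and to realign it to a
 * cacheline boundary.
 */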
static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
					      struct sk_buff *skb)
{
	int pad_bytes, align_hdr_pad;
	int bytes;
	int hlen;

	hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + tcp_hdrlen(skb) :
				 min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
						   hlen);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
	bytes = align_hdr_pad + pad_bytes + skb->len;

	return bytes;
}

/* The most descriptors we could need is MAX_SKB_FRAGS + 4:
 * 1 for each skb frag
 * 1 for the skb linear portion
 * 1 for when tcp hdr needs to be in separate descriptor
 * 1 if the payload wraps to the beginning of the FIFO
 * 1 for metadata descriptor
 */
#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
{
	if (info->skb) {
		dma_unmap_single(dev, dma_unmap_addr(info, dma),
				 dma_unmap_len(info, len),
				 DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	} else {
		dma_unmap_page(dev, dma_unmap_addr(info, dma),
			       dma_unmap_len(info, len),
			       DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	}
}

/* Check if sufficient resources (descriptor ring space, FIFO space) are
 * available to transmit the given number of bytes.
 */
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	bool can_alloc = true;

	if (!tx->raw_addressing)
		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);

	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
}

static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED);

/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
			     struct sk_buff *skb)
{
	int bytes_required = 0;
	u32 nic_done;
	u32 to_do;
	int ret;

	if (!tx->raw_addressing)
		bytes_required = gve_skb_fifo_bytes_required(tx, skb);

	if (likely(gve_can_tx(tx, bytes_required)))
		return 0;

	ret = -EBUSY;
	spin_lock(&tx->clean_lock);
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = nic_done - tx->done;

	/* Only try to clean if there is hope for TX */
	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) {
		if (to_do > 0) {
			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT);
			gve_clean_tx_done(priv, tx, to_do, false);
		}
		if (likely(gve_can_tx(tx, bytes_required)))
			ret = 0;
	}
	if (ret) {
		/* No space, so stop the queue */
		tx->stop_queue++;
		netif_tx_stop_queue(tx->netdev_txq);
	}
	spin_unlock(&tx->clean_lock);

	return ret;
}

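/* Fill the first (packet) descriptor: packet type and checksum flags, the
 * total descriptor count and packet length, and the address/length of the
 * first segment (the header for copy-mode, the linear data otherwise).
 */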
static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
				 u16 csum_offset, u8 ip_summed, bool is_gso,
				 int l4_hdr_offset, u32 desc_cnt,
				 u16 hlen, u64 addr, u16 pkt_len)
{
	/* l4_hdr_offset and csum_offset are in units of 16-bit words */
	if (is_gso) {
		pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (likely(ip_summed == CHECKSUM_PARTIAL)) {
		pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->pkt.type_flags = GVE_TXD_STD;
		pkt_desc->pkt.l4_csum_offset = 0;
		pkt_desc->pkt.l4_hdr_offset = 0;
	}
	pkt_desc->pkt.desc_cnt = desc_cnt;
	pkt_desc->pkt.len = cpu_to_be16(pkt_len);
	pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
}

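/* Fill an optional metadata descriptor carrying the skb's L4 hash
 * (GVE_MTD_SUBTYPE_PATH), which the device can use for path selection.
 */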
static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
				 struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));

	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
				   GVE_MTD_PATH_HASH_L4;
	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
	mtd_desc->mtd.reserved0 = 0;
	mtd_desc->mtd.reserved1 = 0;
}

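/* Fill a segment descriptor for one buffer fragment; for GSO packets it
 * also carries the L3 offset and MSS used for segmentation.
 */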
static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
				 u16 l3_offset, u16 gso_size,
				 bool is_gso_v6, bool is_gso,
				 u16 len, u64 addr)
{
	seg_desc->seg.type_flags = GVE_TXD_SEG;
	if (is_gso) {
		if (is_gso_v6)
			seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
		seg_desc->seg.l3_offset = l3_offset >> 1;
		seg_desc->seg.mss = cpu_to_be16(gso_size);
	}
	seg_desc->seg.seg_len = cpu_to_be16(len);
	seg_desc->seg.seg_addr = cpu_to_be64(addr);
}

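/* Sync every QPL page touched by an iovec for device access before the
 * descriptors referencing it are handed to the NIC.
 */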
static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
				    u64 iov_offset, u64 iov_len)
{
	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	u64 first_page = iov_offset / PAGE_SIZE;
	u64 page;

	for (page = first_page; page <= last_page; page++)
		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);
}

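/* Copy-mode (QPL) transmit: copy the skb into the TX FIFO, writing a packet
 * descriptor for the header, an optional metadata descriptor, and a segment
 * descriptor per payload fragment. Returns the number of descriptors used.
 */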
static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
{
	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	int payload_iov = 2;
	int copy_offset;
	u32 next_idx;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want the tcp header alone in the first
	 * segment; otherwise we want the minimum required by the gVNIC spec.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
			min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	info->skb = skb;
	/* We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
				       &info->iov[0]);
	WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
	payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
					   &info->iov[payload_iov]);

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     1 + mtd_desc_nr + payload_nfrags, hlen,
			     info->iov[hdr_nfrags - 1].iov_offset, skb->len);

	skb_copy_bits(skb, 0,
		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
		      hlen);
	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
				info->iov[hdr_nfrags - 1].iov_offset,
				info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = hlen;

	if (mtd_desc_nr) {
		next_idx = (tx->req + 1) & tx->mask;
		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc[next_idx];

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso,
				     info->iov[i].iov_len,
				     info->iov[i].iov_offset);

		skb_copy_bits(skb, copy_offset,
			      tx->tx_fifo.base + info->iov[i].iov_offset,
			      info->iov[i].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
					info->iov[i].iov_offset,
					info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	return 1 + mtd_desc_nr + payload_nfrags;
}

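/* Raw-addressing transmit: DMA-map the skb's linear data and frags and
 * describe them to the device directly, without copying into a FIFO.
 * Returns the number of descriptors used, or 0 if the skb was dropped
 * because a mapping failed.
 */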
static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
				  struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int hlen, num_descriptors, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	u64 addr;
	u32 len;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want only up to the tcp header in the
	 * first segment to efficiently replicate on each segment; otherwise
	 * we want the linear portion of the skb (which will contain the
	 * checksum because skb->csum_start and skb->csum_offset are given
	 * relative to skb->head) in the first segment.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb);
	len = skb_headlen(skb);

	info->skb = skb;

	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
	if (unlikely(dma_mapping_error(tx->dev, addr))) {
		tx->dma_mapping_error++;
		goto drop;
	}
	dma_unmap_len_set(info, len, len);
	dma_unmap_addr_set(info, dma, addr);

	num_descriptors = 1 + shinfo->nr_frags;
	if (hlen < len)
		num_descriptors++;
	if (mtd_desc_nr)
		num_descriptors++;

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     num_descriptors, hlen, addr, skb->len);

	if (mtd_desc_nr) {
		idx = (idx + 1) & tx->mask;
		mtd_desc = &tx->desc[idx];
		gve_tx_fill_mtd_desc(mtd_desc, skb);
	}

	if (hlen < len) {
		/* For gso the rest of the linear portion of the skb needs to
		 * be in its own descriptor.
		 */
		len -= hlen;
		addr += hlen;
		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];

		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		len = skb_frag_size(frag);
		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr))) {
			tx->dma_mapping_error++;
			goto unmap_drop;
		}
		tx->info[idx].skb = NULL;
		dma_unmap_len_set(&tx->info[idx], len, len);
		dma_unmap_addr_set(&tx->info[idx], dma, addr);

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	return num_descriptors;

unmap_drop:
	i += num_descriptors - shinfo->nr_frags;
	while (i--) {
		/* Skip metadata descriptor, if set */
		if (i == 1 && mtd_desc_nr == 1)
			continue;
		idx--;
		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
	}
drop:
	tx->dropped_pkt++;
	return 0;
}

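/* Transmit entry point (ndo_start_xmit) for the GQI queue formats. Stops
 * the queue and returns NETDEV_TX_BUSY when descriptors or FIFO space are
 * exhausted; otherwise posts the skb and rings the doorbell unless
 * xmit_more allows batching.
 */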
netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int nsegs;

	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,
	     "skb queue index out of range");
	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */

		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		return NETDEV_TX_BUSY;
	}
	if (tx->raw_addressing)
		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
	else
		nsegs = gve_tx_add_skb_copy(priv, tx, skb);

	/* If the packet is getting sent, we need to update the skb */
	if (nsegs) {
		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
		skb_tx_timestamp(skb);
		tx->req += nsegs;
	} else {
		dev_kfree_skb_any(skb);
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	/* Give packets to NIC. Even if this packet failed to send, the
	 * doorbell might need to be rung because of xmit_more.
	 */
	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
	return NETDEV_TX_OK;
}

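/* Copy an XDP frame or XSK buffer into the TX FIFO and write the matching
 * packet/segment descriptors. Returns the number of descriptors used.
 */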
static int gve_tx_fill_xdp(struct gve_priv *priv, struct gve_tx_ring *tx,
			   void *data, int len, void *frame_p, bool is_xsk)
{
	int pad, nfrags, ndescs, iovi, offset;
	struct gve_tx_buffer_state *info;
	u32 reqi = tx->req;

	pad = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, len);
	if (pad >= GVE_GQ_TX_MIN_PKT_DESC_BYTES)
		pad = 0;
	info = &tx->info[reqi & tx->mask];
	info->xdp_frame = frame_p;
	info->xdp.size = len;
	info->xdp.is_xsk = is_xsk;

	nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, pad + len,
				   &info->iov[0]);
	iovi = pad > 0;
	ndescs = nfrags - iovi;
	offset = 0;

	while (iovi < nfrags) {
		if (!offset)
			gve_tx_fill_pkt_desc(&tx->desc[reqi & tx->mask], 0,
					     CHECKSUM_NONE, false, 0, ndescs,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset, len);
		else
			gve_tx_fill_seg_desc(&tx->desc[reqi & tx->mask],
					     0, 0, false, false,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset);

		memcpy(tx->tx_fifo.base + info->iov[iovi].iov_offset,
		       data + offset, info->iov[iovi].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev,
					tx->tx_fifo.qpl->page_buses,
					info->iov[iovi].iov_offset,
					info->iov[iovi].iov_len);
		offset += info->iov[iovi].iov_len;
		iovi++;
		reqi++;
	}

	return ndescs;
}

int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
		 u32 flags)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int i, err = 0, qid;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qid = gve_xdp_tx_queue_id(priv,
				  smp_processor_id() % priv->num_xdp_queues);

	tx = &priv->tx[qid];

	spin_lock(&tx->xdp_lock);
	for (i = 0; i < n; i++) {
		err = gve_xdp_xmit_one(priv, tx, frames[i]->data,
				       frames[i]->len, frames[i]);
		if (err)
			break;
	}

	if (flags & XDP_XMIT_FLUSH)
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);

	spin_unlock(&tx->xdp_lock);

	u64_stats_update_begin(&tx->statss);
	tx->xdp_xmit += n;
	tx->xdp_xmit_errors += n - i;
	u64_stats_update_end(&tx->statss);

	return i ? i : err;
}

int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx,
		     void *data, int len, void *frame_p)
{
	int nsegs;

	if (!gve_can_tx(tx, len + GVE_GQ_TX_MIN_PKT_DESC_BYTES - 1))
		return -EBUSY;

	nsegs = gve_tx_fill_xdp(priv, tx, data, len, frame_p, false);
	tx->req += nsegs;

	return 0;
}

#define GVE_TX_START_THRESH	PAGE_SIZE

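/* Reclaim up to @to_do completed descriptors: unmap raw-addressing buffers,
 * free skbs and FIFO space, update stats and BQL accounting, and optionally
 * wake the queue if it was stopped and resources are available again.
 */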
static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake)
{
	struct gve_tx_buffer_state *info;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	struct sk_buff *skb;
	u32 idx;
	int j;

	for (j = 0; j < to_do; j++) {
		idx = tx->done & tx->mask;
		netif_info(priv, tx_done, priv->dev,
			   "[%d] %s: idx=%d (req=%u done=%u)\n",
			   tx->q_num, __func__, idx, tx->req, tx->done);
		info = &tx->info[idx];
		skb = info->skb;

		/* Unmap the buffer */
		if (tx->raw_addressing)
			gve_tx_unmap_buf(tx->dev, info);
		tx->done++;
		/* Mark as free */
		if (skb) {
			info->skb = NULL;
			bytes += skb->len;
			pkts++;
			dev_consume_skb_any(skb);
			if (tx->raw_addressing)
				continue;
			space_freed += gve_tx_clear_buffer_state(info);
		}
	}

	if (!tx->raw_addressing)
		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);

	/* start the queue if we've stopped it */
#ifndef CONFIG_BQL
	/* Make sure that the doorbells are synced */
	smp_mb();
#endif
	if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
	    likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
		tx->wake_queue++;
		netif_tx_wake_queue(tx->netdev_txq);
	}

	return pkts;
}

u32 gve_tx_load_event_counter(struct gve_priv *priv,
			      struct gve_tx_ring *tx)
{
	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index);
	__be32 counter = READ_ONCE(priv->counter_array[counter_index]);

	return be32_to_cpu(counter);
}

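/* Transmit up to @budget descriptors from the XSK pool's TX ring, copying
 * each buffer through the TX FIFO, and ring the doorbell if anything was
 * sent. Returns the number of descriptors consumed from the pool.
 */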
static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
		      int budget)
{
	struct xdp_desc desc;
	int sent = 0, nsegs;
	void *data;

	spin_lock(&tx->xdp_lock);
	while (sent < budget) {
		if (!gve_can_tx(tx, GVE_TX_START_THRESH))
			goto out;

		if (!xsk_tx_peek_desc(tx->xsk_pool, &desc)) {
			tx->xdp_xsk_done = tx->xdp_xsk_wakeup;
			goto out;
		}

		data = xsk_buff_raw_get_data(tx->xsk_pool, desc.addr);
		nsegs = gve_tx_fill_xdp(priv, tx, data, desc.len, NULL, true);
		tx->req += nsegs;
		sent++;
	}
out:
	if (sent > 0) {
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		xsk_tx_release(tx->xsk_pool);
	}
	spin_unlock(&tx->xdp_lock);
	return sent;
}

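/* NAPI poll for an XDP TX queue: clean completions, service the XSK TX
 * ring if a pool is attached, and report whether more work remains.
 */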
bool gve_xdp_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	bool repoll;
	u32 to_do;

	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_xdp_done(priv, tx, to_do);
	repoll = nic_done != tx->done;

	if (tx->xsk_pool) {
		int sent = gve_xsk_tx(priv, tx, budget);

		u64_stats_update_begin(&tx->statss);
		tx->xdp_xsk_sent += sent;
		u64_stats_update_end(&tx->statss);
		repoll |= (sent == budget);
		if (xsk_uses_need_wakeup(tx->xsk_pool))
			xsk_set_tx_need_wakeup(tx->xsk_pool);
	}

	/* If we still have work we want to repoll */
	return repoll;
}

bool gve_tx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	u32 to_do;

	/* If budget is 0, do all the work */
	if (budget == 0)
		budget = INT_MAX;

	/* The TX path may try to clean completed packets in order to xmit
	 * more. To avoid a cleaning conflict, use spin_lock(), which yields
	 * better xmit/clean concurrency than netif's lock.
	 */
	spin_lock(&tx->clean_lock);
	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_tx_done(priv, tx, to_do, true);
	spin_unlock(&tx->clean_lock);
	/* If we still have work we want to repoll */
	return nic_done != tx->done;
}

bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	u32 nic_done = gve_tx_load_event_counter(priv, tx);

	return nic_done != tx->done;
}