1/*
2 * This file is part of the Chelsio T4 Ethernet driver for Linux.
3 *
4 * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 */
34
35#include <linux/skbuff.h>
36#include <linux/netdevice.h>
37#include <linux/etherdevice.h>
38#include <linux/if_vlan.h>
39#include <linux/ip.h>
40#include <linux/dma-mapping.h>
41#include <linux/jiffies.h>
42#include <linux/prefetch.h>
43#include <linux/export.h>
44#include <net/xfrm.h>
45#include <net/ipv6.h>
46#include <net/tcp.h>
47#include <net/busy_poll.h>
48#ifdef CONFIG_CHELSIO_T4_FCOE
49#include <scsi/fc/fc_fcoe.h>
50#endif /* CONFIG_CHELSIO_T4_FCOE */
51#include "cxgb4.h"
52#include "t4_regs.h"
53#include "t4_values.h"
54#include "t4_msg.h"
55#include "t4fw_api.h"
56#include "cxgb4_ptp.h"
57#include "cxgb4_uld.h"
58#include "cxgb4_tc_mqprio.h"
59#include "sched.h"
60
61/*
62 * Rx buffer size.  We use largish buffers if possible but settle for single
63 * pages under memory shortage.
64 */
65#if PAGE_SHIFT >= 16
66# define FL_PG_ORDER 0
67#else
68# define FL_PG_ORDER (16 - PAGE_SHIFT)
69#endif
70
71/* RX_PULL_LEN should be <= RX_COPY_THRES */
72#define RX_COPY_THRES    256
73#define RX_PULL_LEN      128
74
75/*
76 * Main body length for sk_buffs used for Rx Ethernet packets with fragments.
77 * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room.
78 */
79#define RX_PKT_SKB_LEN   512
80
81/*
82 * Max number of Tx descriptors we clean up at a time.  Should be modest as
83 * freeing skbs isn't cheap and it happens while holding locks.  As long as
84 * we free packets faster than they arrive, we eventually catch up and keep
85 * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.  It should
86 * also match the CIDX Flush Threshold.
87 */
88#define MAX_TX_RECLAIM 32
89
90/*
91 * Max number of Rx buffers we replenish at a time.  Again keep this modest,
92 * allocating buffers isn't cheap either.
93 */
94#define MAX_RX_REFILL 16U
95
96/*
97 * Period of the Rx queue check timer.  This timer runs infrequently as it
98 * only has work to do when the system experiences a severe memory shortage.
99 */
100#define RX_QCHECK_PERIOD (HZ / 2)
101
102/*
103 * Period of the Tx queue check timer.
104 */
105#define TX_QCHECK_PERIOD (HZ / 2)
106
107/*
108 * Max number of Tx descriptors to be reclaimed by the Tx timer.
109 */
110#define MAX_TIMER_TX_RECLAIM 100
111
112/*
113 * Timer index used when backing off due to memory shortage.
114 */
115#define NOMEM_TMR_IDX (SGE_NTIMERS - 1)
116
117/*
118 * Suspension threshold for non-Ethernet Tx queues.  We require enough room
119 * for a full sized WR.
120 */
121#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc))
122
123/*
124 * Max Tx descriptor space we allow for an Ethernet packet to be inlined
125 * into a WR.
126 */
127#define MAX_IMM_TX_PKT_LEN 256
128
129/*
130 * Max size of a WR sent through a control Tx queue.
131 */
132#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN
133
134struct rx_sw_desc {                /* SW state per Rx descriptor */
135	struct page *page;
136	dma_addr_t dma_addr;
137};
138
139/*
140 * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet per skb
141 * buffer).  We currently only support two sizes for 1500- and 9000-byte MTUs.
142 * We could easily support more but there doesn't seem to be much need for
143 * that ...
144 */
145#define FL_MTU_SMALL 1500
146#define FL_MTU_LARGE 9000
147
148static inline unsigned int fl_mtu_bufsize(struct adapter *adapter,
149					  unsigned int mtu)
150{
151	struct sge *s = &adapter->sge;
152
153	return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align);
154}
155
156#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL)
157#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE)
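
/* Purely illustrative numbers (pktshift and fl_align are read from the SGE
 * at init time and vary by adapter): with pktshift = 2 and fl_align = 32,
 * FL_MTU_SMALL_BUFSIZE() works out to ALIGN(2 + 14 + 4 + 1500, 32) = 1536
 * bytes and FL_MTU_LARGE_BUFSIZE() to ALIGN(2 + 14 + 4 + 9000, 32) = 9024
 * bytes.
 */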
158
159/*
160 * Bits 0..3 of rx_sw_desc.dma_addr have special meaning.  The hardware uses
161 * these to specify the buffer size as an index into the SGE Free List Buffer
162 * Size register array.  We also use bit 4, when the buffer has been unmapped
163 * for DMA, but this is of course never sent to the hardware and is only used
164 * to prevent double unmappings.  All of the above requires that the Free List
165 * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are
166 * 32-byte or a power of 2 greater in alignment.  Since the SGE's minimal
167 * Free List Buffer alignment is 32 bytes, this works out for us ...
168 */
169enum {
170	RX_BUF_FLAGS     = 0x1f,   /* bottom five bits are special */
171	RX_BUF_SIZE      = 0x0f,   /* bottom four bits are for buf sizes */
172	RX_UNMAPPED_BUF  = 0x10,   /* buffer is not mapped */
173
174	/*
175	 * XXX We shouldn't depend on being able to use these indices.
176	 * XXX Especially when some other Master PF has initialized the
177	 * XXX adapter or we use the Firmware Configuration File.  We
178	 * XXX should really search through the Host Buffer Size register
179	 * XXX array for the appropriately sized buffer indices.
180	 */
181	RX_SMALL_PG_BUF  = 0x0,   /* small (PAGE_SIZE) page buffer */
182	RX_LARGE_PG_BUF  = 0x1,   /* large (FL_PG_ORDER) page buffer */
183
184	RX_SMALL_MTU_BUF = 0x2,   /* small MTU buffer */
185	RX_LARGE_MTU_BUF = 0x3,   /* large MTU buffer */
186};
187
188static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5};
189#define MIN_NAPI_WORK  1
190
191static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d)
192{
193	return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS;
194}
195
196static inline bool is_buf_mapped(const struct rx_sw_desc *d)
197{
198	return !(d->dma_addr & RX_UNMAPPED_BUF);
199}
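
/* Illustrative example of the encoding above: a buffer mapped at bus
 * address 0x12345680 (32-byte aligned) and posted as a large page buffer is
 * stored as 0x12345680 | RX_LARGE_PG_BUF == 0x12345681.  get_buf_addr()
 * masks off RX_BUF_FLAGS to recover 0x12345680, and is_buf_mapped() returns
 * true because RX_UNMAPPED_BUF (bit 4) is clear.
 */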
200
201/**
202 *	txq_avail - return the number of available slots in a Tx queue
203 *	@q: the Tx queue
204 *
205 *	Returns the number of descriptors in a Tx queue available to write new
206 *	packets.
207 */
208static inline unsigned int txq_avail(const struct sge_txq *q)
209{
210	return q->size - 1 - q->in_use;
211}
212
213/**
214 *	fl_cap - return the capacity of a free-buffer list
215 *	@fl: the FL
216 *
217 *	Returns the capacity of a free-buffer list.  The capacity is less than
218 *	the size because one descriptor needs to be left unpopulated, otherwise
219 *	HW will think the FL is empty.
220 */
221static inline unsigned int fl_cap(const struct sge_fl *fl)
222{
223	return fl->size - 8;   /* 1 descriptor = 8 buffers */
224}
225
226/**
227 *	fl_starving - return whether a Free List is starving.
228 *	@adapter: pointer to the adapter
229 *	@fl: the Free List
230 *
231 *	Tests specified Free List to see whether the number of buffers
232 *	available to the hardware has fallen below our "starvation"
233 *	threshold.
234 */
235static inline bool fl_starving(const struct adapter *adapter,
236			       const struct sge_fl *fl)
237{
238	const struct sge *s = &adapter->sge;
239
240	return fl->avail - fl->pend_cred <= s->fl_starve_thres;
241}
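
/* A small worked example of the two helpers above (all numbers are
 * illustrative): for a Free List created with size = 1024 hardware
 * descriptors, fl_cap() is 1024 - 8 = 1016 buffers.  If fl_starve_thres is
 * 32 and the FL currently has avail = 40 buffers, 10 of which have not yet
 * been credited to the hardware (pend_cred = 10), then 40 - 10 = 30 <= 32
 * and fl_starving() reports the FL as starving.
 */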
242
243int cxgb4_map_skb(struct device *dev, const struct sk_buff *skb,
244		  dma_addr_t *addr)
245{
246	const skb_frag_t *fp, *end;
247	const struct skb_shared_info *si;
248
249	*addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
250	if (dma_mapping_error(dev, *addr))
251		goto out_err;
252
253	si = skb_shinfo(skb);
254	end = &si->frags[si->nr_frags];
255
256	for (fp = si->frags; fp < end; fp++) {
257		*++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp),
258					   DMA_TO_DEVICE);
259		if (dma_mapping_error(dev, *addr))
260			goto unwind;
261	}
262	return 0;
263
264unwind:
265	while (fp-- > si->frags)
266		dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
267
268	dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
269out_err:
270	return -ENOMEM;
271}
272EXPORT_SYMBOL(cxgb4_map_skb);
273
274static void unmap_skb(struct device *dev, const struct sk_buff *skb,
275		      const dma_addr_t *addr)
276{
277	const skb_frag_t *fp, *end;
278	const struct skb_shared_info *si;
279
280	dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE);
281
282	si = skb_shinfo(skb);
283	end = &si->frags[si->nr_frags];
284	for (fp = si->frags; fp < end; fp++)
285		dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE);
286}
287
288#ifdef CONFIG_NEED_DMA_MAP_STATE
289/**
290 *	deferred_unmap_destructor - unmap a packet when it is freed
291 *	@skb: the packet
292 *
293 *	This is the packet destructor used for Tx packets that need to remain
294 *	mapped until they are freed rather than until their Tx descriptors are
295 *	freed.
296 */
297static void deferred_unmap_destructor(struct sk_buff *skb)
298{
299	unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head);
300}
301#endif
302
303/**
304 *	free_tx_desc - reclaims Tx descriptors and their buffers
305 *	@adap: the adapter
306 *	@q: the Tx queue to reclaim descriptors from
307 *	@n: the number of descriptors to reclaim
308 *	@unmap: whether the buffers should be unmapped for DMA
309 *
310 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
311 *	Tx buffers.  Called with the Tx queue lock held.
312 */
313void free_tx_desc(struct adapter *adap, struct sge_txq *q,
314		  unsigned int n, bool unmap)
315{
316	unsigned int cidx = q->cidx;
317	struct tx_sw_desc *d;
318
319	d = &q->sdesc[cidx];
320	while (n--) {
321		if (d->skb) {                       /* an SGL is present */
322			if (unmap && d->addr[0]) {
323				unmap_skb(adap->pdev_dev, d->skb, d->addr);
324				memset(d->addr, 0, sizeof(d->addr));
325			}
326			dev_consume_skb_any(d->skb);
327			d->skb = NULL;
328		}
329		++d;
330		if (++cidx == q->size) {
331			cidx = 0;
332			d = q->sdesc;
333		}
334	}
335	q->cidx = cidx;
336}
337
338/*
339 * Return the number of reclaimable descriptors in a Tx queue.
340 */
341static inline int reclaimable(const struct sge_txq *q)
342{
343	int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
344	hw_cidx -= q->cidx;
345	return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx;
346}
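
/* Example of the wrap-around arithmetic above (illustrative numbers): with
 * a 1024-entry queue, a software cidx of 1020 and a hardware cidx of 4, the
 * raw difference is 4 - 1020 = -1016, so we add the queue size and report
 * 8 reclaimable descriptors.
 */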
347
348/**
349 *	reclaim_completed_tx - reclaims completed TX Descriptors
350 *	@adap: the adapter
351 *	@q: the Tx queue to reclaim completed descriptors from
352 *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
353 *	@unmap: whether the buffers should be unmapped for DMA
354 *
355 *	Reclaims Tx Descriptors that the SGE has indicated it has processed,
356 *	and frees the associated buffers if possible.  If @maxreclaim == -1,
357 *	then we'll use a default maximum.  Called with the TX Queue locked.
358 */
359static inline int reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
360				       int maxreclaim, bool unmap)
361{
362	int reclaim = reclaimable(q);
363
364	if (reclaim) {
365		/*
366		 * Limit the amount of clean up work we do at a time to keep
367		 * the Tx lock hold time O(1).
368		 */
369		if (maxreclaim < 0)
370			maxreclaim = MAX_TX_RECLAIM;
371		if (reclaim > maxreclaim)
372			reclaim = maxreclaim;
373
374		free_tx_desc(adap, q, reclaim, unmap);
375		q->in_use -= reclaim;
376	}
377
378	return reclaim;
379}
380
381/**
382 *	cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
383 *	@adap: the adapter
384 *	@q: the Tx queue to reclaim completed descriptors from
385 *	@unmap: whether the buffers should be unmapped for DMA
386 *
387 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
388 *	and frees the associated buffers if possible.  Called with the Tx
389 *	queue locked.
390 */
391void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
392				bool unmap)
393{
394	(void)reclaim_completed_tx(adap, q, -1, unmap);
395}
396EXPORT_SYMBOL(cxgb4_reclaim_completed_tx);
397
398static inline int get_buf_size(struct adapter *adapter,
399			       const struct rx_sw_desc *d)
400{
401	struct sge *s = &adapter->sge;
402	unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE;
403	int buf_size;
404
405	switch (rx_buf_size_idx) {
406	case RX_SMALL_PG_BUF:
407		buf_size = PAGE_SIZE;
408		break;
409
410	case RX_LARGE_PG_BUF:
411		buf_size = PAGE_SIZE << s->fl_pg_order;
412		break;
413
414	case RX_SMALL_MTU_BUF:
415		buf_size = FL_MTU_SMALL_BUFSIZE(adapter);
416		break;
417
418	case RX_LARGE_MTU_BUF:
419		buf_size = FL_MTU_LARGE_BUFSIZE(adapter);
420		break;
421
422	default:
423		BUG();
424	}
425
426	return buf_size;
427}
428
429/**
430 *	free_rx_bufs - free the Rx buffers on an SGE free list
431 *	@adap: the adapter
432 *	@q: the SGE free list to free buffers from
433 *	@n: how many buffers to free
434 *
435 *	Release the next @n buffers on an SGE free-buffer Rx queue.   The
436 *	buffers must be made inaccessible to HW before calling this function.
437 */
438static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n)
439{
440	while (n--) {
441		struct rx_sw_desc *d = &q->sdesc[q->cidx];
442
443		if (is_buf_mapped(d))
444			dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
445				       get_buf_size(adap, d),
446				       DMA_FROM_DEVICE);
447		put_page(d->page);
448		d->page = NULL;
449		if (++q->cidx == q->size)
450			q->cidx = 0;
451		q->avail--;
452	}
453}
454
455/**
456 *	unmap_rx_buf - unmap the current Rx buffer on an SGE free list
457 *	@adap: the adapter
458 *	@q: the SGE free list
459 *
460 *	Unmap the current buffer on an SGE free-buffer Rx queue.   The
461 *	buffer must be made inaccessible to HW before calling this function.
462 *
463 *	This is similar to @free_rx_bufs above but does not free the buffer.
464 *	Do note that the FL still loses any further access to the buffer.
465 */
466static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q)
467{
468	struct rx_sw_desc *d = &q->sdesc[q->cidx];
469
470	if (is_buf_mapped(d))
471		dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
472			       get_buf_size(adap, d), DMA_FROM_DEVICE);
473	d->page = NULL;
474	if (++q->cidx == q->size)
475		q->cidx = 0;
476	q->avail--;
477}
478
479static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
480{
481	if (q->pend_cred >= 8) {
482		u32 val = adap->params.arch.sge_fl_db;
483
484		if (is_t4(adap->params.chip))
485			val |= PIDX_V(q->pend_cred / 8);
486		else
487			val |= PIDX_T5_V(q->pend_cred / 8);
488
489		/* Make sure all memory writes to the Free List queue are
490		 * committed before we tell the hardware about them.
491		 */
492		wmb();
493
494		/* If we don't have access to the new User Doorbell (T5+), use
495		 * the old doorbell mechanism; otherwise use the new BAR2
496		 * mechanism.
497		 */
498		if (unlikely(q->bar2_addr == NULL)) {
499			t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
500				     val | QID_V(q->cntxt_id));
501		} else {
502			writel(val | QID_V(q->bar2_qid),
503			       q->bar2_addr + SGE_UDB_KDOORBELL);
504
505			/* This Write Memory Barrier will force the write to
506			 * the User Doorbell area to be flushed.
507			 */
508			wmb();
509		}
510		q->pend_cred &= 7;
511	}
512}
513
514static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg,
515				  dma_addr_t mapping)
516{
517	sd->page = pg;
518	sd->dma_addr = mapping;      /* includes size low bits */
519}
520
521/**
522 *	refill_fl - refill an SGE Rx buffer ring
523 *	@adap: the adapter
524 *	@q: the ring to refill
525 *	@n: the number of new buffers to allocate
526 *	@gfp: the gfp flags for the allocations
527 *
528 *	(Re)populate an SGE free-buffer queue with up to @n new packet buffers,
529 *	allocated with the supplied gfp flags.  The caller must assure that
530 *	@n does not exceed the queue's capacity.  If afterwards the queue is
531 *	found critically low, mark it as starving in the bitmap of starving FLs.
532 *
533 *	Returns the number of buffers allocated.
534 */
535static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
536			      gfp_t gfp)
537{
538	struct sge *s = &adap->sge;
539	struct page *pg;
540	dma_addr_t mapping;
541	unsigned int cred = q->avail;
542	__be64 *d = &q->desc[q->pidx];
543	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
544	int node;
545
546#ifdef CONFIG_DEBUG_FS
547	if (test_bit(q->cntxt_id - adap->sge.egr_start, adap->sge.blocked_fl))
548		goto out;
549#endif
550
551	gfp |= __GFP_NOWARN;
552	node = dev_to_node(adap->pdev_dev);
553
554	if (s->fl_pg_order == 0)
555		goto alloc_small_pages;
556
557	/*
558	 * Prefer large buffers
559	 */
560	while (n) {
561		pg = alloc_pages_node(node, gfp | __GFP_COMP, s->fl_pg_order);
562		if (unlikely(!pg)) {
563			q->large_alloc_failed++;
564			break;       /* fall back to single pages */
565		}
566
567		mapping = dma_map_page(adap->pdev_dev, pg, 0,
568				       PAGE_SIZE << s->fl_pg_order,
569				       DMA_FROM_DEVICE);
570		if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
571			__free_pages(pg, s->fl_pg_order);
572			q->mapping_err++;
573			goto out;   /* do not try small pages for this error */
574		}
575		mapping |= RX_LARGE_PG_BUF;
576		*d++ = cpu_to_be64(mapping);
577
578		set_rx_sw_desc(sd, pg, mapping);
579		sd++;
580
581		q->avail++;
582		if (++q->pidx == q->size) {
583			q->pidx = 0;
584			sd = q->sdesc;
585			d = q->desc;
586		}
587		n--;
588	}
589
590alloc_small_pages:
591	while (n--) {
592		pg = alloc_pages_node(node, gfp, 0);
593		if (unlikely(!pg)) {
594			q->alloc_failed++;
595			break;
596		}
597
598		mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE,
599				       DMA_FROM_DEVICE);
600		if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
601			put_page(pg);
602			q->mapping_err++;
603			goto out;
604		}
605		*d++ = cpu_to_be64(mapping);
606
607		set_rx_sw_desc(sd, pg, mapping);
608		sd++;
609
610		q->avail++;
611		if (++q->pidx == q->size) {
612			q->pidx = 0;
613			sd = q->sdesc;
614			d = q->desc;
615		}
616	}
617
618out:	cred = q->avail - cred;
619	q->pend_cred += cred;
620	ring_fl_db(adap, q);
621
622	if (unlikely(fl_starving(adap, q))) {
623		smp_wmb();
624		q->low++;
625		set_bit(q->cntxt_id - adap->sge.egr_start,
626			adap->sge.starving_fl);
627	}
628
629	return cred;
630}
631
632static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
633{
634	refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail),
635		  GFP_ATOMIC);
636}
637
638/**
639 *	alloc_ring - allocate resources for an SGE descriptor ring
640 *	@dev: the PCI device's core device
641 *	@nelem: the number of descriptors
642 *	@elem_size: the size of each descriptor
643 *	@sw_size: the size of the SW state associated with each ring element
644 *	@phys: the physical address of the allocated ring
645 *	@metadata: address of the array holding the SW state for the ring
646 *	@stat_size: extra space in HW ring for status information
647 *	@node: preferred node for memory allocations
648 *
649 *	Allocates resources for an SGE descriptor ring, such as Tx queues,
650 *	free buffer lists, or response queues.  Each SGE ring requires
651 *	space for its HW descriptors plus, optionally, space for the SW state
652 *	associated with each HW entry (the metadata).  The function returns
653 *	three values: the virtual address for the HW ring (the return value
654 *	of the function), the bus address of the HW ring, and the address
655 *	of the SW ring.
656 */
657static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
658			size_t sw_size, dma_addr_t *phys, void *metadata,
659			size_t stat_size, int node)
660{
661	size_t len = nelem * elem_size + stat_size;
662	void *s = NULL;
663	void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL);
664
665	if (!p)
666		return NULL;
667	if (sw_size) {
668		s = kcalloc_node(sw_size, nelem, GFP_KERNEL, node);
669
670		if (!s) {
671			dma_free_coherent(dev, len, p, *phys);
672			return NULL;
673		}
674	}
675	if (metadata)
676		*(void **)metadata = s;
677	return p;
678}
679
680/**
681 *	sgl_len - calculates the size of an SGL of the given capacity
682 *	@n: the number of SGL entries
683 *
684 *	Calculates the number of flits needed for a scatter/gather list that
685 *	can hold the given number of entries.
686 */
687static inline unsigned int sgl_len(unsigned int n)
688{
689	/* A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA
690	 * addresses.  The DSGL Work Request starts off with a 32-bit DSGL
691	 * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N,
692	 * repeated sequences of { Length[i], Length[i+1], Address[i],
693	 * Address[i+1] } (this ensures that all addresses are on 64-bit
694	 * boundaries).  If N is even, then Length[N+1] should be set to 0 and
695	 * Address[N+1] is omitted.
696	 *
697	 * The following calculation incorporates all of the above.  It's
698	 * somewhat hard to follow but, briefly: the "+2" accounts for the
699	 * first two flits which include the DSGL header, Length0 and
700	 * Address0; the "(3*(n-1))/2" covers the main body of list entries (3
701	 * flits for every pair of the remaining N); and
702	 * finally the "+((n-1)&1)" adds the one remaining flit needed if
703	 * (n-1) is odd ...
704	 */
705	n--;
706	return (3 * n) / 2 + (n & 1) + 2;
707}
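
/* Worked example for sgl_len(): n = 4 SGL entries gives, after the n--
 * above, (3 * 3) / 2 + (3 & 1) + 2 = 4 + 1 + 2 = 7 flits -- two flits for
 * the DSGL header, Length0 and Address0, three flits for the next pair of
 * entries, and two flits for the last entry plus its zeroed pad length.
 */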
708
709/**
710 *	flits_to_desc - returns the num of Tx descriptors for the given flits
711 *	@n: the number of flits
712 *
713 *	Returns the number of Tx descriptors needed for the supplied number
714 *	of flits.
715 */
716static inline unsigned int flits_to_desc(unsigned int n)
717{
718	BUG_ON(n > SGE_MAX_WR_LEN / 8);
719	return DIV_ROUND_UP(n, 8);
720}
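
/* For example, the 7 flits computed above for a 4-entry SGL still fit in a
 * single 64-byte Tx descriptor (8 flits per descriptor), while 9 flits
 * would need DIV_ROUND_UP(9, 8) = 2 descriptors.
 */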
721
722/**
723 *	is_eth_imm - can an Ethernet packet be sent as immediate data?
724 *	@skb: the packet
725 *	@chip_ver: chip version
726 *
727 *	Returns whether an Ethernet packet is small enough to fit as
728 *	immediate data. Return value corresponds to headroom required.
729 */
730static inline int is_eth_imm(const struct sk_buff *skb, unsigned int chip_ver)
731{
732	int hdrlen = 0;
733
734	if (skb->encapsulation && skb_shinfo(skb)->gso_size &&
735	    chip_ver > CHELSIO_T5) {
736		hdrlen = sizeof(struct cpl_tx_tnl_lso);
737		hdrlen += sizeof(struct cpl_tx_pkt_core);
738	} else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
739		return 0;
740	} else {
741		hdrlen = skb_shinfo(skb)->gso_size ?
742			 sizeof(struct cpl_tx_pkt_lso_core) : 0;
743		hdrlen += sizeof(struct cpl_tx_pkt);
744	}
745	if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
746		return hdrlen;
747	return 0;
748}
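
/* Example (non-GSO, non-encapsulated packet): hdrlen above is
 * sizeof(struct cpl_tx_pkt), so the packet is sent as immediate data only
 * if skb->len fits within MAX_IMM_TX_PKT_LEN (256 bytes) minus that header;
 * anything larger returns 0 and takes the DMA/SGL path instead.
 */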
749
750/**
751 *	calc_tx_flits - calculate the number of flits for a packet Tx WR
752 *	@skb: the packet
753 *	@chip_ver: chip version
754 *
755 *	Returns the number of flits needed for a Tx WR for the given Ethernet
756 *	packet, including the needed WR and CPL headers.
757 */
758static inline unsigned int calc_tx_flits(const struct sk_buff *skb,
759					 unsigned int chip_ver)
760{
761	unsigned int flits;
762	int hdrlen = is_eth_imm(skb, chip_ver);
763
764	/* If the skb is small enough, we can pump it out as a work request
765	 * with only immediate data.  In that case we just have to have the
766	 * TX Packet header plus the skb data in the Work Request.
767	 */
768
769	if (hdrlen)
770		return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64));
771
772	/* Otherwise, we're going to have to construct a Scatter gather list
773	 * of the skb body and fragments.  We also include the flits necessary
774	 * for the TX Packet Work Request and CPL.  We always have a firmware
775	 * Write Header (incorporated as part of the cpl_tx_pkt_lso and
776	 * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
777	 * message or, if we're doing a Large Send Offload, an LSO CPL message
778	 * with an embedded TX Packet Write CPL message.
779	 */
780	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
781	if (skb_shinfo(skb)->gso_size) {
782		if (skb->encapsulation && chip_ver > CHELSIO_T5) {
783			hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
784				 sizeof(struct cpl_tx_tnl_lso);
785		} else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
786			u32 pkt_hdrlen;
787
788			pkt_hdrlen = eth_get_headlen(skb->dev, skb->data,
789						     skb_headlen(skb));
790			hdrlen = sizeof(struct fw_eth_tx_eo_wr) +
791				 round_up(pkt_hdrlen, 16);
792		} else {
793			hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
794				 sizeof(struct cpl_tx_pkt_lso_core);
795		}
796
797		hdrlen += sizeof(struct cpl_tx_pkt_core);
798		flits += (hdrlen / sizeof(__be64));
799	} else {
800		flits += (sizeof(struct fw_eth_tx_pkt_wr) +
801			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
802	}
803	return flits;
804}
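
/* Illustrative non-GSO case: a packet with linear data plus two page
 * fragments needs sgl_len(2 + 1) = 5 flits for the SGL, plus
 * (sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core)) /
 * sizeof(__be64) flits for the WR and CPL headers (the else branch above).
 */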
805
806/**
807 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
808 *	@skb: the packet
809 *	@chip_ver: chip version
810 *
811 *	Returns the number of Tx descriptors needed for the given Ethernet
812 *	packet, including the needed WR and CPL headers.
813 */
814static inline unsigned int calc_tx_descs(const struct sk_buff *skb,
815					 unsigned int chip_ver)
816{
817	return flits_to_desc(calc_tx_flits(skb, chip_ver));
818}
819
820/**
821 *	cxgb4_write_sgl - populate a scatter/gather list for a packet
822 *	@skb: the packet
823 *	@q: the Tx queue we are writing into
824 *	@sgl: starting location for writing the SGL
825 *	@end: points right after the end of the SGL
826 *	@start: start offset into skb main-body data to include in the SGL
827 *	@addr: the list of bus addresses for the SGL elements
828 *
829 *	Generates a gather list for the buffers that make up a packet.
830 *	The caller must provide adequate space for the SGL that will be written.
831 *	The SGL includes all of the packet's page fragments and the data in its
832 *	main body except for the first @start bytes.  @sgl must be 16-byte
833 *	aligned and within a Tx descriptor with available space.  @end points
834 *	right after the end of the SGL but does not account for any potential
835 *	wrap around, i.e., @end > @sgl.
836 */
837void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q,
838		     struct ulptx_sgl *sgl, u64 *end, unsigned int start,
839		     const dma_addr_t *addr)
840{
841	unsigned int i, len;
842	struct ulptx_sge_pair *to;
843	const struct skb_shared_info *si = skb_shinfo(skb);
844	unsigned int nfrags = si->nr_frags;
845	struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1];
846
847	len = skb_headlen(skb) - start;
848	if (likely(len)) {
849		sgl->len0 = htonl(len);
850		sgl->addr0 = cpu_to_be64(addr[0] + start);
851		nfrags++;
852	} else {
853		sgl->len0 = htonl(skb_frag_size(&si->frags[0]));
854		sgl->addr0 = cpu_to_be64(addr[1]);
855	}
856
857	sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
858			      ULPTX_NSGE_V(nfrags));
859	if (likely(--nfrags == 0))
860		return;
861	/*
862	 * Most of the complexity below deals with the possibility we hit the
863	 * end of the queue in the middle of writing the SGL.  For this case
864	 * only we create the SGL in a temporary buffer and then copy it.
865	 */
866	to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
867
868	for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) {
869		to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
870		to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i]));
871		to->addr[0] = cpu_to_be64(addr[i]);
872		to->addr[1] = cpu_to_be64(addr[++i]);
873	}
874	if (nfrags) {
875		to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
876		to->len[1] = cpu_to_be32(0);
877		to->addr[0] = cpu_to_be64(addr[i + 1]);
878	}
879	if (unlikely((u8 *)end > (u8 *)q->stat)) {
880		unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
881
882		if (likely(part0))
883			memcpy(sgl->sge, buf, part0);
884		part1 = (u8 *)end - (u8 *)q->stat;
885		memcpy(q->desc, (u8 *)buf + part0, part1);
886		end = (void *)q->desc + part1;
887	}
888	if ((uintptr_t)end & 8)           /* 0-pad to multiple of 16 */
889		*end = 0;
890}
891EXPORT_SYMBOL(cxgb4_write_sgl);
892
893/*	cxgb4_write_partial_sgl - populate SGL for partial packet
894 *	@skb: the packet
895 *	@q: the Tx queue we are writing into
896 *	@sgl: starting location for writing the SGL
897 *	@end: points right after the end of the SGL
898 *	@addr: the list of bus addresses for the SGL elements
899 *	@start: start offset in the SKB where partial data starts
900 *	@len: length of data from @start to send out
901 *
902 *	This API will handle sending out partial data of a skb if required.
903 *	Unlike cxgb4_write_sgl, @start can be any offset into the skb data,
904 *	and @len will decide how much data after @start offset to send out.
905 */
906void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q,
907			     struct ulptx_sgl *sgl, u64 *end,
908			     const dma_addr_t *addr, u32 start, u32 len)
909{
910	struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1] = {0}, *to;
911	u32 frag_size, skb_linear_data_len = skb_headlen(skb);
912	struct skb_shared_info *si = skb_shinfo(skb);
913	u8 i = 0, frag_idx = 0, nfrags = 0;
914	skb_frag_t *frag;
915
916	/* Fill the first SGL either from linear data or from partial
917	 * frag based on @start.
918	 */
919	if (unlikely(start < skb_linear_data_len)) {
920		frag_size = min(len, skb_linear_data_len - start);
921		sgl->len0 = htonl(frag_size);
922		sgl->addr0 = cpu_to_be64(addr[0] + start);
923		len -= frag_size;
924		nfrags++;
925	} else {
926		start -= skb_linear_data_len;
927		frag = &si->frags[frag_idx];
928		frag_size = skb_frag_size(frag);
929		/* find the first frag */
930		while (start >= frag_size) {
931			start -= frag_size;
932			frag_idx++;
933			frag = &si->frags[frag_idx];
934			frag_size = skb_frag_size(frag);
935		}
936
937		frag_size = min(len, skb_frag_size(frag) - start);
938		sgl->len0 = cpu_to_be32(frag_size);
939		sgl->addr0 = cpu_to_be64(addr[frag_idx + 1] + start);
940		len -= frag_size;
941		nfrags++;
942		frag_idx++;
943	}
944
945	/* If the entire partial data fit in one SGL, then send it out
946	 * now.
947	 */
948	if (!len)
949		goto done;
950
951	/* Most of the complexity below deals with the possibility we hit the
952	 * end of the queue in the middle of writing the SGL.  For this case
953	 * only we create the SGL in a temporary buffer and then copy it.
954	 */
955	to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
956
957	/* If the skb couldn't fit in first SGL completely, fill the
958	 * rest of the frags in subsequent SGLs. Note that each SGL
959	 * pair can store 2 frags.
960	 */
961	while (len) {
962		frag_size = min(len, skb_frag_size(&si->frags[frag_idx]));
963		to->len[i & 1] = cpu_to_be32(frag_size);
964		to->addr[i & 1] = cpu_to_be64(addr[frag_idx + 1]);
965		if (i && (i & 1))
966			to++;
967		nfrags++;
968		frag_idx++;
969		i++;
970		len -= frag_size;
971	}
972
973	/* If we ended in an odd boundary, then set the second SGL's
974	 * length in the pair to 0.
975	 */
976	if (i & 1)
977		to->len[1] = cpu_to_be32(0);
978
979	/* Copy from temporary buffer to Tx ring, in case we hit the
980	 * end of the queue in the middle of writing the SGL.
981	 */
982	if (unlikely((u8 *)end > (u8 *)q->stat)) {
983		u32 part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
984
985		if (likely(part0))
986			memcpy(sgl->sge, buf, part0);
987		part1 = (u8 *)end - (u8 *)q->stat;
988		memcpy(q->desc, (u8 *)buf + part0, part1);
989		end = (void *)q->desc + part1;
990	}
991
992	/* 0-pad to multiple of 16 */
993	if ((uintptr_t)end & 8)
994		*end = 0;
995done:
996	sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
997			ULPTX_NSGE_V(nfrags));
998}
999EXPORT_SYMBOL(cxgb4_write_partial_sgl);
1000
1001/* This function copies a 64-byte coalesced work request to
1002 * memory-mapped BAR2 space.  For a coalesced WR, the SGE fetches
1003 * the data from this FIFO instead of from the host.
1004 */
1005static void cxgb_pio_copy(u64 __iomem *dst, u64 *src)
1006{
1007	int count = 8;
1008
1009	while (count) {
1010		writeq(*src, dst);
1011		src++;
1012		dst++;
1013		count--;
1014	}
1015}
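
/* Note: count starts at 8 because a coalesced Work Request is exactly one
 * 64-byte Tx descriptor, i.e. eight 64-bit writes into the Write Combining
 * Gather Buffer.
 */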
1016
1017/**
1018 *	cxgb4_ring_tx_db - check and potentially ring a Tx queue's doorbell
1019 *	@adap: the adapter
1020 *	@q: the Tx queue
1021 *	@n: number of new descriptors to give to HW
1022 *
1023 *	Ring the doorbell for a Tx queue.
1024 */
1025inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
1026{
1027	/* Make sure that all writes to the TX Descriptors are committed
1028	 * before we tell the hardware about them.
1029	 */
1030	wmb();
1031
1032	/* If we don't have access to the new User Doorbell (T5+), use the old
1033	 * doorbell mechanism; otherwise use the new BAR2 mechanism.
1034	 */
1035	if (unlikely(q->bar2_addr == NULL)) {
1036		u32 val = PIDX_V(n);
1037		unsigned long flags;
1038
1039		/* For T4 we need to participate in the Doorbell Recovery
1040		 * mechanism.
1041		 */
1042		spin_lock_irqsave(&q->db_lock, flags);
1043		if (!q->db_disabled)
1044			t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
1045				     QID_V(q->cntxt_id) | val);
1046		else
1047			q->db_pidx_inc += n;
1048		q->db_pidx = q->pidx;
1049		spin_unlock_irqrestore(&q->db_lock, flags);
1050	} else {
1051		u32 val = PIDX_T5_V(n);
1052
1053		/* T4 and later chips share the same PIDX field offset within
1054		 * the doorbell, but T5 and later shrank the field in order to
1055		 * gain a bit for Doorbell Priority.  The field was absurdly
1056		 * large in the first place (14 bits) so we just use the T5
1057		 * and later limits and warn if a Queue ID is too large.
1058		 */
1059		WARN_ON(val & DBPRIO_F);
1060
1061		/* If we're only writing a single TX Descriptor and we can use
1062		 * Inferred QID registers, we can use the Write Combining
1063		 * Gather Buffer; otherwise we use the simple doorbell.
1064		 */
1065		if (n == 1 && q->bar2_qid == 0) {
1066			int index = (q->pidx
1067				     ? (q->pidx - 1)
1068				     : (q->size - 1));
1069			u64 *wr = (u64 *)&q->desc[index];
1070
1071			cxgb_pio_copy((u64 __iomem *)
1072				      (q->bar2_addr + SGE_UDB_WCDOORBELL),
1073				      wr);
1074		} else {
1075			writel(val | QID_V(q->bar2_qid),
1076			       q->bar2_addr + SGE_UDB_KDOORBELL);
1077		}
1078
1079		/* This Write Memory Barrier will force the write to the User
1080		 * Doorbell area to be flushed.  This is needed to prevent
1081		 * writes on different CPUs for the same queue from hitting
1082		 * the adapter out of order.  This is required when some Work
1083		 * Requests take the Write Combine Gather Buffer path (user
1084		 * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
1085		 * take the traditional path where we simply increment the
1086		 * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
1087		 * hardware DMA read the actual Work Request.
1088		 */
1089		wmb();
1090	}
1091}
1092EXPORT_SYMBOL(cxgb4_ring_tx_db);
1093
1094/**
1095 *	cxgb4_inline_tx_skb - inline a packet's data into Tx descriptors
1096 *	@skb: the packet
1097 *	@q: the Tx queue where the packet will be inlined
1098 *	@pos: starting position in the Tx queue where to inline the packet
1099 *
1100 *	Inline a packet's contents directly into Tx descriptors, starting at
1101 *	the given position within the Tx DMA ring.
1102 *	Most of the complexity of this operation is dealing with wrap arounds
1103 *	in the middle of the packet we want to inline.
1104 */
1105void cxgb4_inline_tx_skb(const struct sk_buff *skb,
1106			 const struct sge_txq *q, void *pos)
1107{
1108	int left = (void *)q->stat - pos;
1109	u64 *p;
1110
1111	if (likely(skb->len <= left)) {
1112		if (likely(!skb->data_len))
1113			skb_copy_from_linear_data(skb, pos, skb->len);
1114		else
1115			skb_copy_bits(skb, 0, pos, skb->len);
1116		pos += skb->len;
1117	} else {
1118		skb_copy_bits(skb, 0, pos, left);
1119		skb_copy_bits(skb, left, q->desc, skb->len - left);
1120		pos = (void *)q->desc + (skb->len - left);
1121	}
1122
1123	/* 0-pad to multiple of 16 */
1124	p = PTR_ALIGN(pos, 8);
1125	if ((uintptr_t)p & 8)
1126		*p = 0;
1127}
1128EXPORT_SYMBOL(cxgb4_inline_tx_skb);
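
/* Example of the wrap handling above (illustrative numbers): if only 40
 * bytes remain before q->stat and the packet is 100 bytes long, the first
 * 40 bytes land at @pos and the remaining 60 bytes continue at the start of
 * the descriptor ring (q->desc).
 */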
1129
1130static void *inline_tx_skb_header(const struct sk_buff *skb,
1131				  const struct sge_txq *q,  void *pos,
1132				  int length)
1133{
1134	u64 *p;
1135	int left = (void *)q->stat - pos;
1136
1137	if (likely(length <= left)) {
1138		memcpy(pos, skb->data, length);
1139		pos += length;
1140	} else {
1141		memcpy(pos, skb->data, left);
1142		memcpy(q->desc, skb->data + left, length - left);
1143		pos = (void *)q->desc + (length - left);
1144	}
1145	/* 0-pad to multiple of 16 */
1146	p = PTR_ALIGN(pos, 8);
1147	if ((uintptr_t)p & 8) {
1148		*p = 0;
1149		return p + 1;
1150	}
1151	return p;
1152}
1153
1154/*
1155 * Figure out what HW csum a packet wants and return the appropriate control
1156 * bits.
1157 */
1158static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb)
1159{
1160	int csum_type;
1161	bool inner_hdr_csum = false;
1162	u16 proto, ver;
1163
1164	if (skb->encapsulation &&
1165	    (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5))
1166		inner_hdr_csum = true;
1167
1168	if (inner_hdr_csum) {
1169		ver = inner_ip_hdr(skb)->version;
1170		proto = (ver == 4) ? inner_ip_hdr(skb)->protocol :
1171			inner_ipv6_hdr(skb)->nexthdr;
1172	} else {
1173		ver = ip_hdr(skb)->version;
1174		proto = (ver == 4) ? ip_hdr(skb)->protocol :
1175			ipv6_hdr(skb)->nexthdr;
1176	}
1177
1178	if (ver == 4) {
1179		if (proto == IPPROTO_TCP)
1180			csum_type = TX_CSUM_TCPIP;
1181		else if (proto == IPPROTO_UDP)
1182			csum_type = TX_CSUM_UDPIP;
1183		else {
1184nocsum:			/*
1185			 * unknown protocol, disable HW csum
1186			 * and hope a bad packet is detected
1187			 */
1188			return TXPKT_L4CSUM_DIS_F;
1189		}
1190	} else {
1191		/*
1192		 * this doesn't work with extension headers
1193		 */
1194		if (proto == IPPROTO_TCP)
1195			csum_type = TX_CSUM_TCPIP6;
1196		else if (proto == IPPROTO_UDP)
1197			csum_type = TX_CSUM_UDPIP6;
1198		else
1199			goto nocsum;
1200	}
1201
1202	if (likely(csum_type >= TX_CSUM_TCPIP)) {
1203		int eth_hdr_len, l4_len;
1204		u64 hdr_len;
1205
1206		if (inner_hdr_csum) {
1207			/* This allows checksum offload for all encapsulated
1208			 * packets like GRE etc..
1209			 */
1210			l4_len = skb_inner_network_header_len(skb);
1211			eth_hdr_len = skb_inner_network_offset(skb) - ETH_HLEN;
1212		} else {
1213			l4_len = skb_network_header_len(skb);
1214			eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
1215		}
1216		hdr_len = TXPKT_IPHDR_LEN_V(l4_len);
1217
1218		if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
1219			hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1220		else
1221			hdr_len |= T6_TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1222		return TXPKT_CSUM_TYPE_V(csum_type) | hdr_len;
1223	} else {
1224		int start = skb_transport_offset(skb);
1225
1226		return TXPKT_CSUM_TYPE_V(csum_type) |
1227			TXPKT_CSUM_START_V(start) |
1228			TXPKT_CSUM_LOC_V(start + skb->csum_offset);
1229	}
1230}
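
/* Example: for a plain (non-encapsulated) TCP/IPv4 packet with no VLAN tag
 * and a 20-byte IP header, this returns TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP) |
 * TXPKT_IPHDR_LEN_V(20) with an Ethernet header length field of 0 (that
 * field, TXPKT_ETHHDR_LEN_V or its T6 variant, only encodes bytes beyond
 * the basic 14-byte Ethernet header, e.g. VLAN tags).
 */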
1231
1232static void eth_txq_stop(struct sge_eth_txq *q)
1233{
1234	netif_tx_stop_queue(q->txq);
1235	q->q.stops++;
1236}
1237
1238static inline void txq_advance(struct sge_txq *q, unsigned int n)
1239{
1240	q->in_use += n;
1241	q->pidx += n;
1242	if (q->pidx >= q->size)
1243		q->pidx -= q->size;
1244}
1245
1246#ifdef CONFIG_CHELSIO_T4_FCOE
1247static inline int
1248cxgb_fcoe_offload(struct sk_buff *skb, struct adapter *adap,
1249		  const struct port_info *pi, u64 *cntrl)
1250{
1251	const struct cxgb_fcoe *fcoe = &pi->fcoe;
1252
1253	if (!(fcoe->flags & CXGB_FCOE_ENABLED))
1254		return 0;
1255
1256	if (skb->protocol != htons(ETH_P_FCOE))
1257		return 0;
1258
1259	skb_reset_mac_header(skb);
1260	skb->mac_len = sizeof(struct ethhdr);
1261
1262	skb_set_network_header(skb, skb->mac_len);
1263	skb_set_transport_header(skb, skb->mac_len + sizeof(struct fcoe_hdr));
1264
1265	if (!cxgb_fcoe_sof_eof_supported(adap, skb))
1266		return -ENOTSUPP;
1267
1268	/* FC CRC offload */
1269	*cntrl = TXPKT_CSUM_TYPE_V(TX_CSUM_FCOE) |
1270		     TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F |
1271		     TXPKT_CSUM_START_V(CXGB_FCOE_TXPKT_CSUM_START) |
1272		     TXPKT_CSUM_END_V(CXGB_FCOE_TXPKT_CSUM_END) |
1273		     TXPKT_CSUM_LOC_V(CXGB_FCOE_TXPKT_CSUM_END);
1274	return 0;
1275}
1276#endif /* CONFIG_CHELSIO_T4_FCOE */
1277
1278/* Returns tunnel type if hardware supports offloading of the same.
1279 * It is called only for T5 and onwards.
1280 */
1281enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb)
1282{
1283	u8 l4_hdr = 0;
1284	enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1285	struct port_info *pi = netdev_priv(skb->dev);
1286	struct adapter *adapter = pi->adapter;
1287
1288	if (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
1289	    skb->inner_protocol != htons(ETH_P_TEB))
1290		return tnl_type;
1291
1292	switch (vlan_get_protocol(skb)) {
1293	case htons(ETH_P_IP):
1294		l4_hdr = ip_hdr(skb)->protocol;
1295		break;
1296	case htons(ETH_P_IPV6):
1297		l4_hdr = ipv6_hdr(skb)->nexthdr;
1298		break;
1299	default:
1300		return tnl_type;
1301	}
1302
1303	switch (l4_hdr) {
1304	case IPPROTO_UDP:
1305		if (adapter->vxlan_port == udp_hdr(skb)->dest)
1306			tnl_type = TX_TNL_TYPE_VXLAN;
1307		else if (adapter->geneve_port == udp_hdr(skb)->dest)
1308			tnl_type = TX_TNL_TYPE_GENEVE;
1309		break;
1310	default:
1311		return tnl_type;
1312	}
1313
1314	return tnl_type;
1315}
1316
1317static inline void t6_fill_tnl_lso(struct sk_buff *skb,
1318				   struct cpl_tx_tnl_lso *tnl_lso,
1319				   enum cpl_tx_tnl_lso_type tnl_type)
1320{
1321	u32 val;
1322	int in_eth_xtra_len;
1323	int l3hdr_len = skb_network_header_len(skb);
1324	int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1325	const struct skb_shared_info *ssi = skb_shinfo(skb);
1326	bool v6 = (ip_hdr(skb)->version == 6);
1327
1328	val = CPL_TX_TNL_LSO_OPCODE_V(CPL_TX_TNL_LSO) |
1329	      CPL_TX_TNL_LSO_FIRST_F |
1330	      CPL_TX_TNL_LSO_LAST_F |
1331	      (v6 ? CPL_TX_TNL_LSO_IPV6OUT_F : 0) |
1332	      CPL_TX_TNL_LSO_ETHHDRLENOUT_V(eth_xtra_len / 4) |
1333	      CPL_TX_TNL_LSO_IPHDRLENOUT_V(l3hdr_len / 4) |
1334	      (v6 ? 0 : CPL_TX_TNL_LSO_IPHDRCHKOUT_F) |
1335	      CPL_TX_TNL_LSO_IPLENSETOUT_F |
1336	      (v6 ? 0 : CPL_TX_TNL_LSO_IPIDINCOUT_F);
1337	tnl_lso->op_to_IpIdSplitOut = htonl(val);
1338
1339	tnl_lso->IpIdOffsetOut = 0;
1340
1341	/* Get the tunnel header length */
1342	val = skb_inner_mac_header(skb) - skb_mac_header(skb);
1343	in_eth_xtra_len = skb_inner_network_header(skb) -
1344			  skb_inner_mac_header(skb) - ETH_HLEN;
1345
1346	switch (tnl_type) {
1347	case TX_TNL_TYPE_VXLAN:
1348	case TX_TNL_TYPE_GENEVE:
1349		tnl_lso->UdpLenSetOut_to_TnlHdrLen =
1350			htons(CPL_TX_TNL_LSO_UDPCHKCLROUT_F |
1351			CPL_TX_TNL_LSO_UDPLENSETOUT_F);
1352		break;
1353	default:
1354		tnl_lso->UdpLenSetOut_to_TnlHdrLen = 0;
1355		break;
1356	}
1357
1358	tnl_lso->UdpLenSetOut_to_TnlHdrLen |=
1359		 htons(CPL_TX_TNL_LSO_TNLHDRLEN_V(val) |
1360		       CPL_TX_TNL_LSO_TNLTYPE_V(tnl_type));
1361
1362	tnl_lso->r1 = 0;
1363
1364	val = CPL_TX_TNL_LSO_ETHHDRLEN_V(in_eth_xtra_len / 4) |
1365	      CPL_TX_TNL_LSO_IPV6_V(inner_ip_hdr(skb)->version == 6) |
1366	      CPL_TX_TNL_LSO_IPHDRLEN_V(skb_inner_network_header_len(skb) / 4) |
1367	      CPL_TX_TNL_LSO_TCPHDRLEN_V(inner_tcp_hdrlen(skb) / 4);
1368	tnl_lso->Flow_to_TcpHdrLen = htonl(val);
1369
1370	tnl_lso->IpIdOffset = htons(0);
1371
1372	tnl_lso->IpIdSplit_to_Mss = htons(CPL_TX_TNL_LSO_MSS_V(ssi->gso_size));
1373	tnl_lso->TCPSeqOffset = htonl(0);
1374	tnl_lso->EthLenOffset_Size = htonl(CPL_TX_TNL_LSO_SIZE_V(skb->len));
1375}
1376
1377static inline void *write_tso_wr(struct adapter *adap, struct sk_buff *skb,
1378				 struct cpl_tx_pkt_lso_core *lso)
1379{
1380	int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1381	int l3hdr_len = skb_network_header_len(skb);
1382	const struct skb_shared_info *ssi;
1383	bool ipv6 = false;
1384
1385	ssi = skb_shinfo(skb);
1386	if (ssi->gso_type & SKB_GSO_TCPV6)
1387		ipv6 = true;
1388
1389	lso->lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1390			      LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F |
1391			      LSO_IPV6_V(ipv6) |
1392			      LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1393			      LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1394			      LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1395	lso->ipid_ofst = htons(0);
1396	lso->mss = htons(ssi->gso_size);
1397	lso->seqno_offset = htonl(0);
1398	if (is_t4(adap->params.chip))
1399		lso->len = htonl(skb->len);
1400	else
1401		lso->len = htonl(LSO_T5_XFER_SIZE_V(skb->len));
1402
1403	return (void *)(lso + 1);
1404}
1405
1406/**
1407 *	t4_sge_eth_txq_egress_update - handle Ethernet TX Queue update
1408 *	@adap: the adapter
1409 *	@eq: the Ethernet TX Queue
1410 *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
1411 *
1412 *	We're typically called here to update the state of an Ethernet TX
1413 *	Queue with respect to the hardware's progress in consuming the TX
1414 *	Work Requests that we've put on that Egress Queue.  This happens
1415 *	when we get Egress Queue Update messages and also prophylactically
1416 *	in regular timer-based Ethernet TX Queue maintenance.
1417 */
1418int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *eq,
1419				 int maxreclaim)
1420{
1421	unsigned int reclaimed, hw_cidx;
1422	struct sge_txq *q = &eq->q;
1423	int hw_in_use;
1424
1425	if (!q->in_use || !__netif_tx_trylock(eq->txq))
1426		return 0;
1427
1428	/* Reclaim pending completed TX Descriptors. */
1429	reclaimed = reclaim_completed_tx(adap, &eq->q, maxreclaim, true);
1430
1431	hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
1432	hw_in_use = q->pidx - hw_cidx;
1433	if (hw_in_use < 0)
1434		hw_in_use += q->size;
1435
1436	/* If the TX Queue is currently stopped and there's now more than half
1437	 * the queue available, restart it.  Otherwise bail out since the rest
1438	 * of what we want to do here has to do with the possibility of shipping
1439	 * any currently buffered Coalesced TX Work Request.
1440	 */
1441	if (netif_tx_queue_stopped(eq->txq) && hw_in_use < (q->size / 2)) {
1442		netif_tx_wake_queue(eq->txq);
1443		eq->q.restarts++;
1444	}
1445
1446	__netif_tx_unlock(eq->txq);
1447	return reclaimed;
1448}
1449
1450static inline int cxgb4_validate_skb(struct sk_buff *skb,
1451				     struct net_device *dev,
1452				     u32 min_pkt_len)
1453{
1454	u32 max_pkt_len;
1455
1456	/* The chip min packet length is 10 octets but some firmware
1457	 * commands have a minimum packet length requirement. So, play
1458	 * safe and reject anything shorter than @min_pkt_len.
1459	 */
1460	if (unlikely(skb->len < min_pkt_len))
1461		return -EINVAL;
1462
1463	/* Discard the packet if the length is greater than mtu */
1464	max_pkt_len = ETH_HLEN + dev->mtu;
1465
1466	if (skb_vlan_tagged(skb))
1467		max_pkt_len += VLAN_HLEN;
1468
1469	if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
1470		return -EINVAL;
1471
1472	return 0;
1473}
1474
1475static void *write_eo_udp_wr(struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
1476			     u32 hdr_len)
1477{
1478	wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
1479	wr->u.udpseg.ethlen = skb_network_offset(skb);
1480	wr->u.udpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
1481	wr->u.udpseg.udplen = sizeof(struct udphdr);
1482	wr->u.udpseg.rtplen = 0;
1483	wr->u.udpseg.r4 = 0;
1484	if (skb_shinfo(skb)->gso_size)
1485		wr->u.udpseg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
1486	else
1487		wr->u.udpseg.mss = cpu_to_be16(skb->len - hdr_len);
1488	wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
1489	wr->u.udpseg.plen = cpu_to_be32(skb->len - hdr_len);
1490
1491	return (void *)(wr + 1);
1492}
1493
1494/**
1495 *	cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
1496 *	@skb: the packet
1497 *	@dev: the egress net device
1498 *
1499 *	Add a packet to an SGE Ethernet Tx queue.  Runs with softirqs disabled.
1500 */
1501static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1502{
1503	enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1504	bool ptp_enabled = is_ptp_enabled(skb, dev);
1505	unsigned int last_desc, flits, ndesc;
1506	u32 wr_mid, ctrl0, op, sgl_off = 0;
1507	const struct skb_shared_info *ssi;
1508	int len, qidx, credits, ret, left;
1509	struct tx_sw_desc *sgl_sdesc;
1510	struct fw_eth_tx_eo_wr *eowr;
1511	struct fw_eth_tx_pkt_wr *wr;
1512	struct cpl_tx_pkt_core *cpl;
1513	const struct port_info *pi;
1514	bool immediate = false;
1515	u64 cntrl, *end, *sgl;
1516	struct sge_eth_txq *q;
1517	unsigned int chip_ver;
1518	struct adapter *adap;
1519
1520	ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
1521	if (ret)
1522		goto out_free;
1523
1524	pi = netdev_priv(dev);
1525	adap = pi->adapter;
1526	ssi = skb_shinfo(skb);
1527#if IS_ENABLED(CONFIG_CHELSIO_IPSEC_INLINE)
1528	if (xfrm_offload(skb) && !ssi->gso_size)
1529		return adap->uld[CXGB4_ULD_IPSEC].tx_handler(skb, dev);
1530#endif /* CHELSIO_IPSEC_INLINE */
1531
1532#if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE)
1533	if (cxgb4_is_ktls_skb(skb) &&
1534	    (skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb))))
1535		return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev);
1536#endif /* CHELSIO_TLS_DEVICE */
1537
1538	qidx = skb_get_queue_mapping(skb);
1539	if (ptp_enabled) {
1540		if (!(adap->ptp_tx_skb)) {
1541			skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
1542			adap->ptp_tx_skb = skb_get(skb);
1543		} else {
1544			goto out_free;
1545		}
1546		q = &adap->sge.ptptxq;
1547	} else {
1548		q = &adap->sge.ethtxq[qidx + pi->first_qset];
1549	}
1550	skb_tx_timestamp(skb);
1551
1552	reclaim_completed_tx(adap, &q->q, -1, true);
1553	cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1554
1555#ifdef CONFIG_CHELSIO_T4_FCOE
1556	ret = cxgb_fcoe_offload(skb, adap, pi, &cntrl);
1557	if (unlikely(ret == -EOPNOTSUPP))
1558		goto out_free;
1559#endif /* CONFIG_CHELSIO_T4_FCOE */
1560
1561	chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
1562	flits = calc_tx_flits(skb, chip_ver);
1563	ndesc = flits_to_desc(flits);
1564	credits = txq_avail(&q->q) - ndesc;
1565
1566	if (unlikely(credits < 0)) {
1567		eth_txq_stop(q);
1568		dev_err(adap->pdev_dev,
1569			"%s: Tx ring %u full while queue awake!\n",
1570			dev->name, qidx);
1571		return NETDEV_TX_BUSY;
1572	}
1573
1574	if (is_eth_imm(skb, chip_ver))
1575		immediate = true;
1576
1577	if (skb->encapsulation && chip_ver > CHELSIO_T5)
1578		tnl_type = cxgb_encap_offload_supported(skb);
1579
1580	last_desc = q->q.pidx + ndesc - 1;
1581	if (last_desc >= q->q.size)
1582		last_desc -= q->q.size;
1583	sgl_sdesc = &q->q.sdesc[last_desc];
1584
1585	if (!immediate &&
1586	    unlikely(cxgb4_map_skb(adap->pdev_dev, skb, sgl_sdesc->addr) < 0)) {
1587		memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1588		q->mapping_err++;
1589		goto out_free;
1590	}
1591
1592	wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1593	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1594		/* After we're done injecting the Work Request for this
1595		 * packet, we'll be below our "stop threshold" so stop the TX
1596		 * Queue now and schedule a request for an SGE Egress Queue
1597		 * Update message. The queue will get started later on when
1598		 * the firmware processes this Work Request and sends us an
1599		 * Egress Queue Status Update message indicating that space
1600		 * has opened up.
1601		 */
1602		eth_txq_stop(q);
1603		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1604	}
1605
1606	wr = (void *)&q->q.desc[q->q.pidx];
1607	eowr = (void *)&q->q.desc[q->q.pidx];
1608	wr->equiq_to_len16 = htonl(wr_mid);
1609	wr->r3 = cpu_to_be64(0);
1610	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
1611		end = (u64 *)eowr + flits;
1612	else
1613		end = (u64 *)wr + flits;
1614
1615	len = immediate ? skb->len : 0;
1616	len += sizeof(*cpl);
1617	if (ssi->gso_size && !(ssi->gso_type & SKB_GSO_UDP_L4)) {
1618		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1619		struct cpl_tx_tnl_lso *tnl_lso = (void *)(wr + 1);
1620
1621		if (tnl_type)
1622			len += sizeof(*tnl_lso);
1623		else
1624			len += sizeof(*lso);
1625
1626		wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
1627				       FW_WR_IMMDLEN_V(len));
1628		if (tnl_type) {
1629			struct iphdr *iph = ip_hdr(skb);
1630
1631			t6_fill_tnl_lso(skb, tnl_lso, tnl_type);
1632			cpl = (void *)(tnl_lso + 1);
1633			/* Driver is expected to compute partial checksum that
1634			 * does not include the IP Total Length.
1635			 */
1636			if (iph->version == 4) {
1637				iph->check = 0;
1638				iph->tot_len = 0;
1639				iph->check = ~ip_fast_csum((u8 *)iph, iph->ihl);
1640			}
1641			if (skb->ip_summed == CHECKSUM_PARTIAL)
1642				cntrl = hwcsum(adap->params.chip, skb);
1643		} else {
1644			cpl = write_tso_wr(adap, skb, lso);
1645			cntrl = hwcsum(adap->params.chip, skb);
1646		}
1647		sgl = (u64 *)(cpl + 1); /* sgl start here */
1648		q->tso++;
1649		q->tx_cso += ssi->gso_segs;
1650	} else if (ssi->gso_size) {
1651		u64 *start;
1652		u32 hdrlen;
1653
1654		hdrlen = eth_get_headlen(dev, skb->data, skb_headlen(skb));
1655		len += hdrlen;
1656		wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
1657					     FW_ETH_TX_EO_WR_IMMDLEN_V(len));
1658		cpl = write_eo_udp_wr(skb, eowr, hdrlen);
1659		cntrl = hwcsum(adap->params.chip, skb);
1660
1661		start = (u64 *)(cpl + 1);
1662		sgl = (u64 *)inline_tx_skb_header(skb, &q->q, (void *)start,
1663						  hdrlen);
1664		if (unlikely(start > sgl)) {
1665			left = (u8 *)end - (u8 *)q->q.stat;
1666			end = (void *)q->q.desc + left;
1667		}
1668		sgl_off = hdrlen;
1669		q->uso++;
1670		q->tx_cso += ssi->gso_segs;
1671	} else {
1672		if (ptp_enabled)
1673			op = FW_PTP_TX_PKT_WR;
1674		else
1675			op = FW_ETH_TX_PKT_WR;
1676		wr->op_immdlen = htonl(FW_WR_OP_V(op) |
1677				       FW_WR_IMMDLEN_V(len));
1678		cpl = (void *)(wr + 1);
1679		sgl = (u64 *)(cpl + 1);
1680		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1681			cntrl = hwcsum(adap->params.chip, skb) |
1682				TXPKT_IPCSUM_DIS_F;
1683			q->tx_cso++;
1684		}
1685	}
1686
1687	if (unlikely((u8 *)sgl >= (u8 *)q->q.stat)) {
1688		/* If current position is already at the end of the
1689		 * txq, reset the current to point to start of the queue
1690		 * and update the end ptr as well.
1691		 */
1692		left = (u8 *)end - (u8 *)q->q.stat;
1693		end = (void *)q->q.desc + left;
1694		sgl = (void *)q->q.desc;
1695	}
1696
1697	if (skb_vlan_tag_present(skb)) {
1698		q->vlan_ins++;
1699		cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
1700#ifdef CONFIG_CHELSIO_T4_FCOE
1701		if (skb->protocol == htons(ETH_P_FCOE))
1702			cntrl |= TXPKT_VLAN_V(
1703				 ((skb->priority & 0x7) << VLAN_PRIO_SHIFT));
1704#endif /* CONFIG_CHELSIO_T4_FCOE */
1705	}
1706
1707	ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_INTF_V(pi->tx_chan) |
1708		TXPKT_PF_V(adap->pf);
1709	if (ptp_enabled)
1710		ctrl0 |= TXPKT_TSTAMP_F;
1711#ifdef CONFIG_CHELSIO_T4_DCB
1712	if (is_t4(adap->params.chip))
1713		ctrl0 |= TXPKT_OVLAN_IDX_V(q->dcb_prio);
1714	else
1715		ctrl0 |= TXPKT_T5_OVLAN_IDX_V(q->dcb_prio);
1716#endif
1717	cpl->ctrl0 = htonl(ctrl0);
1718	cpl->pack = htons(0);
1719	cpl->len = htons(skb->len);
1720	cpl->ctrl1 = cpu_to_be64(cntrl);
1721
1722	if (immediate) {
1723		cxgb4_inline_tx_skb(skb, &q->q, sgl);
1724		dev_consume_skb_any(skb);
1725	} else {
1726		cxgb4_write_sgl(skb, &q->q, (void *)sgl, end, sgl_off,
1727				sgl_sdesc->addr);
1728		skb_orphan(skb);
1729		sgl_sdesc->skb = skb;
1730	}
1731
1732	txq_advance(&q->q, ndesc);
1733
1734	cxgb4_ring_tx_db(adap, &q->q, ndesc);
1735	return NETDEV_TX_OK;
1736
1737out_free:
1738	dev_kfree_skb_any(skb);
1739	return NETDEV_TX_OK;
1740}
1741
1742/* Constants ... */
1743enum {
1744	/* Egress Queue sizes, producer and consumer indices are all in units
1745	 * of Egress Context Units (bytes).  Note that as far as the hardware is
1746	 * concerned, the free list is an Egress Queue (the host produces free
1747	 * buffers which the hardware consumes) and free list entries are
1748	 * 64-bit PCI DMA addresses.
1749	 */
1750	EQ_UNIT = SGE_EQ_IDXSIZE,
1751	FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1752	TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1753
1754	T4VF_ETHTXQ_MAX_HDR = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1755			       sizeof(struct cpl_tx_pkt_lso_core) +
1756			       sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64),
1757};
1758
1759/**
1760 *	t4vf_is_eth_imm - can an Ethernet packet be sent as immediate data?
1761 *	@skb: the packet
1762 *
1763 *	Returns whether an Ethernet packet is small enough to fit completely as
1764 *	immediate data.
1765 */
1766static inline int t4vf_is_eth_imm(const struct sk_buff *skb)
1767{
1768	/* The VF Driver uses the FW_ETH_TX_PKT_VM_WR firmware Work Request
1769	 * which does not accommodate immediate data.  We could dike out all
1770	 * of the support code for immediate data but that would tie our hands
1771	 * too much if we ever want to enhance the firmware.  It would also
1772	 * create more differences between the PF and VF Drivers.
1773	 */
1774	return false;
1775}
1776
1777/**
1778 *	t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR
1779 *	@skb: the packet
1780 *
1781 *	Returns the number of flits needed for a TX Work Request for the
1782 *	given Ethernet packet, including the needed WR and CPL headers.
1783 */
1784static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb)
1785{
1786	unsigned int flits;
1787
1788	/* If the skb is small enough, we can pump it out as a work request
1789	 * with only immediate data.  In that case we just have to have the
1790	 * TX Packet header plus the skb data in the Work Request.
1791	 */
1792	if (t4vf_is_eth_imm(skb))
1793		return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt),
1794				    sizeof(__be64));
1795
1796	/* Otherwise, we're going to have to construct a Scatter/Gather List
1797	 * of the skb body and fragments.  We also include the flits necessary
1798	 * for the TX Packet Work Request and CPL.  We always have a firmware
1799	 * Write Header (incorporated as part of the cpl_tx_pkt_lso and
1800	 * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
1801	 * message or, if we're doing a Large Send Offload, an LSO CPL message
1802	 * with an embedded TX Packet Write CPL message.
1803	 */
1804	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
1805	if (skb_shinfo(skb)->gso_size)
1806		flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1807			  sizeof(struct cpl_tx_pkt_lso_core) +
1808			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1809	else
1810		flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1811			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1812	return flits;
1813}
1814
1815/**
1816 *	cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
1817 *	@skb: the packet
1818 *	@dev: the egress net device
1819 *
1820 *	Add a packet to an SGE Ethernet TX queue.  Runs with softirqs disabled.
1821 */
1822static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
1823				     struct net_device *dev)
1824{
1825	unsigned int last_desc, flits, ndesc;
1826	const struct skb_shared_info *ssi;
1827	struct fw_eth_tx_pkt_vm_wr *wr;
1828	struct tx_sw_desc *sgl_sdesc;
1829	struct cpl_tx_pkt_core *cpl;
1830	const struct port_info *pi;
1831	struct sge_eth_txq *txq;
1832	struct adapter *adapter;
1833	int qidx, credits, ret;
1834	size_t fw_hdr_copy_len;
1835	u64 cntrl, *end;
1836	u32 wr_mid;
1837
1838	/* The chip minimum packet length is 10 octets but the firmware
1839	 * command that we are using requires that we copy the Ethernet header
1840	 * (including the VLAN tag) into the Work Request header, so we
1841	 * reject anything smaller than that ...
1842	 */
1843	fw_hdr_copy_len = sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
1844			  sizeof(wr->ethtype) + sizeof(wr->vlantci);
1845	ret = cxgb4_validate_skb(skb, dev, fw_hdr_copy_len);
1846	if (ret)
1847		goto out_free;
1848
1849	/* Figure out which TX Queue we're going to use. */
1850	pi = netdev_priv(dev);
1851	adapter = pi->adapter;
1852	qidx = skb_get_queue_mapping(skb);
1853	WARN_ON(qidx >= pi->nqsets);
1854	txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
1855
1856	/* Take this opportunity to reclaim any TX Descriptors whose DMA
1857	 * transfers have completed.
1858	 */
1859	reclaim_completed_tx(adapter, &txq->q, -1, true);
1860
1861	/* Calculate the number of flits and TX Descriptors we're going to
1862	 * need along with how many TX Descriptors will be left over after
1863	 * we inject our Work Request.
1864	 */
1865	flits = t4vf_calc_tx_flits(skb);
1866	ndesc = flits_to_desc(flits);
1867	credits = txq_avail(&txq->q) - ndesc;
1868
1869	if (unlikely(credits < 0)) {
1870		/* Not enough room for this packet's Work Request.  Stop the
1871		 * TX Queue and return a "busy" condition.  The queue will get
1872		 * started later on when the firmware informs us that space
1873		 * has opened up.
1874		 */
1875		eth_txq_stop(txq);
1876		dev_err(adapter->pdev_dev,
1877			"%s: TX ring %u full while queue awake!\n",
1878			dev->name, qidx);
1879		return NETDEV_TX_BUSY;
1880	}
1881
1882	last_desc = txq->q.pidx + ndesc - 1;
1883	if (last_desc >= txq->q.size)
1884		last_desc -= txq->q.size;
1885	sgl_sdesc = &txq->q.sdesc[last_desc];
1886
1887	if (!t4vf_is_eth_imm(skb) &&
1888	    unlikely(cxgb4_map_skb(adapter->pdev_dev, skb,
1889				   sgl_sdesc->addr) < 0)) {
1890		/* We need to map the skb into PCI DMA space (because it can't
1891		 * be in-lined directly into the Work Request) and the mapping
1892		 * operation failed.  Record the error and drop the packet.
1893		 */
1894		memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1895		txq->mapping_err++;
1896		goto out_free;
1897	}
1898
1899	wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1900	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1901		/* After we're done injecting the Work Request for this
1902		 * packet, we'll be below our "stop threshold" so stop the TX
1903		 * Queue now and schedule a request for an SGE Egress Queue
1904		 * Update message.  The queue will get started later on when
1905		 * the firmware processes this Work Request and sends us an
1906		 * Egress Queue Status Update message indicating that space
1907		 * has opened up.
1908		 */
1909		eth_txq_stop(txq);
1910		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1911	}
1912
1913	/* Start filling in our Work Request.  Note that we do _not_ handle
1914	 * the WR Header wrapping around the TX Descriptor Ring.  If our
1915	 * maximum header size ever exceeds one TX Descriptor, we'll need to
1916	 * do something else here.
1917	 */
1918	WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
1919	wr = (void *)&txq->q.desc[txq->q.pidx];
1920	wr->equiq_to_len16 = cpu_to_be32(wr_mid);
1921	wr->r3[0] = cpu_to_be32(0);
1922	wr->r3[1] = cpu_to_be32(0);
1923	skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
1924	end = (u64 *)wr + flits;
1925
1926	/* If this is a Large Send Offload packet we'll put in an LSO CPL
1927	 * message with an encapsulated TX Packet CPL message.  Otherwise we
1928	 * just use a TX Packet CPL message.
1929	 */
1930	ssi = skb_shinfo(skb);
1931	if (ssi->gso_size) {
1932		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1933		bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
1934		int l3hdr_len = skb_network_header_len(skb);
1935		int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1936
1937		wr->op_immdlen =
1938			cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1939				    FW_WR_IMMDLEN_V(sizeof(*lso) +
1940						    sizeof(*cpl)));
1941		 /* Fill in the LSO CPL message. */
1942		lso->lso_ctrl =
1943			cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1944				    LSO_FIRST_SLICE_F |
1945				    LSO_LAST_SLICE_F |
1946				    LSO_IPV6_V(v6) |
1947				    LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1948				    LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1949				    LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1950		lso->ipid_ofst = cpu_to_be16(0);
1951		lso->mss = cpu_to_be16(ssi->gso_size);
1952		lso->seqno_offset = cpu_to_be32(0);
1953		if (is_t4(adapter->params.chip))
1954			lso->len = cpu_to_be32(skb->len);
1955		else
1956			lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
1957
1958		/* Set up TX Packet CPL pointer, control word and perform
1959		 * accounting.
1960		 */
1961		cpl = (void *)(lso + 1);
1962
1963		if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5)
1964			cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1965		else
1966			cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1967
1968		cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
1969					   TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
1970			 TXPKT_IPHDR_LEN_V(l3hdr_len);
1971		txq->tso++;
1972		txq->tx_cso += ssi->gso_segs;
1973	} else {
1974		int len;
1975
1976		len = (t4vf_is_eth_imm(skb)
1977		       ? skb->len + sizeof(*cpl)
1978		       : sizeof(*cpl));
1979		wr->op_immdlen =
1980			cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1981				    FW_WR_IMMDLEN_V(len));
1982
1983		/* Set up TX Packet CPL pointer, control word and perform
1984		 * accounting.
1985		 */
1986		cpl = (void *)(wr + 1);
1987		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1988			cntrl = hwcsum(adapter->params.chip, skb) |
1989				TXPKT_IPCSUM_DIS_F;
1990			txq->tx_cso++;
1991		} else {
1992			cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1993		}
1994	}
1995
1996	/* If there's a VLAN tag present, add that to the list of things to
1997	 * do in this Work Request.
1998	 */
1999	if (skb_vlan_tag_present(skb)) {
2000		txq->vlan_ins++;
2001		cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2002	}
2003
2004	 /* Fill in the TX Packet CPL message header. */
2005	cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2006				 TXPKT_INTF_V(pi->port_id) |
2007				 TXPKT_PF_V(0));
2008	cpl->pack = cpu_to_be16(0);
2009	cpl->len = cpu_to_be16(skb->len);
2010	cpl->ctrl1 = cpu_to_be64(cntrl);
2011
2012	/* Fill in the body of the TX Packet CPL message with either in-lined
2013	 * data or a Scatter/Gather List.
2014	 */
2015	if (t4vf_is_eth_imm(skb)) {
2016		/* In-line the packet's data and free the skb since we don't
2017		 * need it any longer.
2018		 */
2019		cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
2020		dev_consume_skb_any(skb);
2021	} else {
2022		/* Write the skb's Scatter/Gather list into the TX Packet CPL
2023		 * message and retain a pointer to the skb so we can free it
2024		 * later when its DMA completes.  (We store the skb pointer
2025		 * in the Software Descriptor corresponding to the last TX
2026		 * Descriptor used by the Work Request.)
2027		 *
2028		 * The retained skb will be freed when the corresponding TX
2029		 * Descriptors are reclaimed after their DMAs complete.
2030		 * However, this could take quite a while since, in general,
2031		 * the hardware is set up to be lazy about sending DMA
2032		 * completion notifications to us and we mostly perform TX
2033		 * reclaims in the transmit routine.
2034		 *
2035		 * This is good for performance but means that we rely on new
2036		 * TX packets arriving to run the destructors of completed
2037		 * packets, which open up space in their sockets' send queues.
2038		 * Sometimes we do not get such new packets causing TX to
2039		 * stall.  A single UDP transmitter is a good example of this
2040		 * situation.  We have a clean up timer that periodically
2041		 * reclaims completed packets but it doesn't run often enough
2042		 * (nor do we want it to) to prevent lengthy stalls.  A
2043		 * solution to this problem is to run the destructor early,
2044		 * after the packet is queued but before it's DMAd.  A con is
2045		 * that we lie to socket memory accounting, but the amount of
2046		 * extra memory is reasonable (limited by the number of TX
2047		 * descriptors), the packets almost always do get freed quickly
2048		 * by subsequent new packets, and for protocols like TCP that
2049		 * wait for ACKs before really freeing the data, the extra
2050		 * memory held is even less.  On the positive side we run the destructors
2051		 * on the sending CPU rather than on a potentially different
2052		 * completing CPU, usually a good thing.
2053		 *
2054		 * Run the destructor before telling the DMA engine about the
2055		 * packet to make sure it doesn't complete and get freed
2056		 * prematurely.
2057		 */
2058		struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
2059		struct sge_txq *tq = &txq->q;
2060
2061		/* If the Work Request header was an exact multiple of our TX
2062		 * Descriptor length, then it's possible that the starting SGL
2063		 * pointer lines up exactly with the end of our TX Descriptor
2064		 * ring.  If that's the case, wrap around to the beginning
2065		 * here ...
2066		 */
2067		if (unlikely((void *)sgl == (void *)tq->stat)) {
2068			sgl = (void *)tq->desc;
2069			end = (void *)((void *)tq->desc +
2070				       ((void *)end - (void *)tq->stat));
2071		}
2072
2073		cxgb4_write_sgl(skb, tq, sgl, end, 0, sgl_sdesc->addr);
2074		skb_orphan(skb);
2075		sgl_sdesc->skb = skb;
2076	}
2077
2078	/* Advance our internal TX Queue state, tell the hardware about
2079	 * the new TX descriptors and return success.
2080	 */
2081	txq_advance(&txq->q, ndesc);
2082
2083	cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
2084	return NETDEV_TX_OK;
2085
2086out_free:
2087	/* An error of some sort happened.  Free the TX skb and tell the
2088	 * OS that we've "dealt" with the packet ...
2089	 */
2090	dev_kfree_skb_any(skb);
2091	return NETDEV_TX_OK;
2092}
2093
2094/**
2095 * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
2096 * @q: the SGE control Tx queue
2097 *
2098 * This is a variant of cxgb4_reclaim_completed_tx() that is used
2099 * for Tx queues that send only immediate data (presently just
2100 * the control queues) and thus do not have any sk_buffs to release.
2101 */
2102static inline void reclaim_completed_tx_imm(struct sge_txq *q)
2103{
2104	int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
2105	int reclaim = hw_cidx - q->cidx;
2106
2107	if (reclaim < 0)
2108		reclaim += q->size;
2109
2110	q->in_use -= reclaim;
2111	q->cidx = hw_cidx;
2112}
2113
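/* Advance an ETHOFLD software queue index by @n slots, wrapping around
 * at @max.
 */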
2114static inline void eosw_txq_advance_index(u32 *idx, u32 n, u32 max)
2115{
2116	u32 val = *idx + n;
2117
2118	if (val >= max)
2119		val -= max;
2120
2121	*idx = val;
2122}
2123
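/**
 *	cxgb4_eosw_txq_free_desc - free completed ETHOFLD Tx descriptors
 *	@adap: the adapter
 *	@eosw_txq: the software ETHOFLD Tx queue
 *	@ndesc: number of descriptors to free
 *
 *	Walks @ndesc software descriptors starting at last_cidx, unmaps any
 *	DMA mappings, frees the attached skbs and advances last_cidx past
 *	the freed entries.
 */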
2124void cxgb4_eosw_txq_free_desc(struct adapter *adap,
2125			      struct sge_eosw_txq *eosw_txq, u32 ndesc)
2126{
2127	struct tx_sw_desc *d;
2128
2129	d = &eosw_txq->desc[eosw_txq->last_cidx];
2130	while (ndesc--) {
2131		if (d->skb) {
2132			if (d->addr[0]) {
2133				unmap_skb(adap->pdev_dev, d->skb, d->addr);
2134				memset(d->addr, 0, sizeof(d->addr));
2135			}
2136			dev_consume_skb_any(d->skb);
2137			d->skb = NULL;
2138		}
2139		eosw_txq_advance_index(&eosw_txq->last_cidx, 1,
2140				       eosw_txq->ndesc);
2141		d = &eosw_txq->desc[eosw_txq->last_cidx];
2142	}
2143}
2144
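/* Advance the producer index of an ETHOFLD software queue and account
 * for the newly used descriptors.
 */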
2145static inline void eosw_txq_advance(struct sge_eosw_txq *eosw_txq, u32 n)
2146{
2147	eosw_txq_advance_index(&eosw_txq->pidx, n, eosw_txq->ndesc);
2148	eosw_txq->inuse += n;
2149}
2150
2151static inline int eosw_txq_enqueue(struct sge_eosw_txq *eosw_txq,
2152				   struct sk_buff *skb)
2153{
2154	if (eosw_txq->inuse == eosw_txq->ndesc)
2155		return -ENOMEM;
2156
2157	eosw_txq->desc[eosw_txq->pidx].skb = skb;
2158	return 0;
2159}
2160
2161static inline struct sk_buff *eosw_txq_peek(struct sge_eosw_txq *eosw_txq)
2162{
2163	return eosw_txq->desc[eosw_txq->last_pidx].skb;
2164}
2165
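/* Return the number of flits (8-byte units) needed to send @skb on an
 * ETHOFLD Tx queue: the FW_ETH_TX_EO_WR header, the CPL(s), the inlined
 * packet headers of @hdr_len bytes (rounded up to 16 bytes), plus the
 * SGL entries for any remaining payload.
 */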
2166static inline u8 ethofld_calc_tx_flits(struct adapter *adap,
2167				       struct sk_buff *skb, u32 hdr_len)
2168{
2169	u8 flits, nsgl = 0;
2170	u32 wrlen;
2171
2172	wrlen = sizeof(struct fw_eth_tx_eo_wr) + sizeof(struct cpl_tx_pkt_core);
2173	if (skb_shinfo(skb)->gso_size &&
2174	    !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2175		wrlen += sizeof(struct cpl_tx_pkt_lso_core);
2176
2177	wrlen += roundup(hdr_len, 16);
2178
2179	/* Packet headers + WR + CPLs */
2180	flits = DIV_ROUND_UP(wrlen, 8);
2181
2182	if (skb_shinfo(skb)->nr_frags > 0) {
2183		if (skb_headlen(skb) - hdr_len)
2184			nsgl = sgl_len(skb_shinfo(skb)->nr_frags + 1);
2185		else
2186			nsgl = sgl_len(skb_shinfo(skb)->nr_frags);
2187	} else if (skb->len - hdr_len) {
2188		nsgl = sgl_len(1);
2189	}
2190
2191	return flits + nsgl;
2192}
2193
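/* Fill in the FW_ETH_TX_EO_WR header for @skb, requesting a firmware
 * completion whenever half of the work request credits have been
 * consumed, and return the location of the TX Packet CPL that follows
 * the WR (and the LSO CPL, if TSO is in use).
 */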
2194static void *write_eo_wr(struct adapter *adap, struct sge_eosw_txq *eosw_txq,
2195			 struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
2196			 u32 hdr_len, u32 wrlen)
2197{
2198	const struct skb_shared_info *ssi = skb_shinfo(skb);
2199	struct cpl_tx_pkt_core *cpl;
2200	u32 immd_len, wrlen16;
2201	bool compl = false;
2202	u8 ver, proto;
2203
2204	ver = ip_hdr(skb)->version;
2205	proto = (ver == 6) ? ipv6_hdr(skb)->nexthdr : ip_hdr(skb)->protocol;
2206
2207	wrlen16 = DIV_ROUND_UP(wrlen, 16);
2208	immd_len = sizeof(struct cpl_tx_pkt_core);
2209	if (skb_shinfo(skb)->gso_size &&
2210	    !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2211		immd_len += sizeof(struct cpl_tx_pkt_lso_core);
2212	immd_len += hdr_len;
2213
2214	if (!eosw_txq->ncompl ||
2215	    (eosw_txq->last_compl + wrlen16) >=
2216	    (adap->params.ofldq_wr_cred / 2)) {
2217		compl = true;
2218		eosw_txq->ncompl++;
2219		eosw_txq->last_compl = 0;
2220	}
2221
2222	wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
2223				     FW_ETH_TX_EO_WR_IMMDLEN_V(immd_len) |
2224				     FW_WR_COMPL_V(compl));
2225	wr->equiq_to_len16 = cpu_to_be32(FW_WR_LEN16_V(wrlen16) |
2226					 FW_WR_FLOWID_V(eosw_txq->hwtid));
2227	wr->r3 = 0;
2228	if (proto == IPPROTO_UDP) {
2229		cpl = write_eo_udp_wr(skb, wr, hdr_len);
2230	} else {
2231		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
2232		wr->u.tcpseg.ethlen = skb_network_offset(skb);
2233		wr->u.tcpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
2234		wr->u.tcpseg.tcplen = tcp_hdrlen(skb);
2235		wr->u.tcpseg.tsclk_tsoff = 0;
2236		wr->u.tcpseg.r4 = 0;
2237		wr->u.tcpseg.r5 = 0;
2238		wr->u.tcpseg.plen = cpu_to_be32(skb->len - hdr_len);
2239
2240		if (ssi->gso_size) {
2241			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
2242
2243			wr->u.tcpseg.mss = cpu_to_be16(ssi->gso_size);
2244			cpl = write_tso_wr(adap, skb, lso);
2245		} else {
2246			wr->u.tcpseg.mss = cpu_to_be16(0xffff);
2247			cpl = (void *)(wr + 1);
2248		}
2249	}
2250
2251	eosw_txq->cred -= wrlen16;
2252	eosw_txq->last_compl += wrlen16;
2253	return cpl;
2254}
2255
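/* Transfer the skb at the head of an ETHOFLD software queue onto its
 * bound hardware ETHOFLD Tx queue: build the Work Request (or inline a
 * pending FLOWC request during state transitions), copy in the packet
 * headers, attach an SGL for any payload and ring the doorbell.
 * Returns a negative errno if the hardware queue or the CPL credits
 * are exhausted, leaving the skb queued for a later retry.
 */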
2256static int ethofld_hard_xmit(struct net_device *dev,
2257			     struct sge_eosw_txq *eosw_txq)
2258{
2259	struct port_info *pi = netdev2pinfo(dev);
2260	struct adapter *adap = netdev2adap(dev);
2261	u32 wrlen, wrlen16, hdr_len, data_len;
2262	enum sge_eosw_state next_state;
2263	u64 cntrl, *start, *end, *sgl;
2264	struct sge_eohw_txq *eohw_txq;
2265	struct cpl_tx_pkt_core *cpl;
2266	struct fw_eth_tx_eo_wr *wr;
2267	bool skip_eotx_wr = false;
2268	struct tx_sw_desc *d;
2269	struct sk_buff *skb;
2270	int left, ret = 0;
2271	u8 flits, ndesc;
2272
2273	eohw_txq = &adap->sge.eohw_txq[eosw_txq->hwqid];
2274	spin_lock(&eohw_txq->lock);
2275	reclaim_completed_tx_imm(&eohw_txq->q);
2276
2277	d = &eosw_txq->desc[eosw_txq->last_pidx];
2278	skb = d->skb;
2279	skb_tx_timestamp(skb);
2280
2281	wr = (struct fw_eth_tx_eo_wr *)&eohw_txq->q.desc[eohw_txq->q.pidx];
2282	if (unlikely(eosw_txq->state != CXGB4_EO_STATE_ACTIVE &&
2283		     eosw_txq->last_pidx == eosw_txq->flowc_idx)) {
2284		hdr_len = skb->len;
2285		data_len = 0;
2286		flits = DIV_ROUND_UP(hdr_len, 8);
2287		if (eosw_txq->state == CXGB4_EO_STATE_FLOWC_OPEN_SEND)
2288			next_state = CXGB4_EO_STATE_FLOWC_OPEN_REPLY;
2289		else
2290			next_state = CXGB4_EO_STATE_FLOWC_CLOSE_REPLY;
2291		skip_eotx_wr = true;
2292	} else {
2293		hdr_len = eth_get_headlen(dev, skb->data, skb_headlen(skb));
2294		data_len = skb->len - hdr_len;
2295		flits = ethofld_calc_tx_flits(adap, skb, hdr_len);
2296	}
2297	ndesc = flits_to_desc(flits);
2298	wrlen = flits * 8;
2299	wrlen16 = DIV_ROUND_UP(wrlen, 16);
2300
2301	left = txq_avail(&eohw_txq->q) - ndesc;
2302
2303	/* If there are no descriptors left in hardware queues or no
2304	 * CPL credits left in software queues, then wait for them
2305	 * to come back and retry again.  Note that we always request a
2306	 * credit update via interrupt once half of the credits have been
2307	 * consumed.  So, the interrupt will eventually restore the
2308	 * credits and invoke the Tx path again.
2309	 */
2310	if (unlikely(left < 0 || wrlen16 > eosw_txq->cred)) {
2311		ret = -ENOMEM;
2312		goto out_unlock;
2313	}
2314
2315	if (unlikely(skip_eotx_wr)) {
2316		start = (u64 *)wr;
2317		eosw_txq->state = next_state;
2318		eosw_txq->cred -= wrlen16;
2319		eosw_txq->ncompl++;
2320		eosw_txq->last_compl = 0;
2321		goto write_wr_headers;
2322	}
2323
2324	cpl = write_eo_wr(adap, eosw_txq, skb, wr, hdr_len, wrlen);
2325	cntrl = hwcsum(adap->params.chip, skb);
2326	if (skb_vlan_tag_present(skb))
2327		cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2328
2329	cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2330				 TXPKT_INTF_V(pi->tx_chan) |
2331				 TXPKT_PF_V(adap->pf));
2332	cpl->pack = 0;
2333	cpl->len = cpu_to_be16(skb->len);
2334	cpl->ctrl1 = cpu_to_be64(cntrl);
2335
2336	start = (u64 *)(cpl + 1);
2337
2338write_wr_headers:
2339	sgl = (u64 *)inline_tx_skb_header(skb, &eohw_txq->q, (void *)start,
2340					  hdr_len);
2341	if (data_len) {
2342		ret = cxgb4_map_skb(adap->pdev_dev, skb, d->addr);
2343		if (unlikely(ret)) {
2344			memset(d->addr, 0, sizeof(d->addr));
2345			eohw_txq->mapping_err++;
2346			goto out_unlock;
2347		}
2348
2349		end = (u64 *)wr + flits;
2350		if (unlikely(start > sgl)) {
2351			left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2352			end = (void *)eohw_txq->q.desc + left;
2353		}
2354
2355		if (unlikely((u8 *)sgl >= (u8 *)eohw_txq->q.stat)) {
2356			/* If the current position is already at the end of the
2357			 * txq, reset it to point to the start of the queue and
2358			 * update the end pointer as well.
2359			 */
2360			left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2361
2362			end = (void *)eohw_txq->q.desc + left;
2363			sgl = (void *)eohw_txq->q.desc;
2364		}
2365
2366		cxgb4_write_sgl(skb, &eohw_txq->q, (void *)sgl, end, hdr_len,
2367				d->addr);
2368	}
2369
2370	if (skb_shinfo(skb)->gso_size) {
2371		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
2372			eohw_txq->uso++;
2373		else
2374			eohw_txq->tso++;
2375		eohw_txq->tx_cso += skb_shinfo(skb)->gso_segs;
2376	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2377		eohw_txq->tx_cso++;
2378	}
2379
2380	if (skb_vlan_tag_present(skb))
2381		eohw_txq->vlan_ins++;
2382
2383	txq_advance(&eohw_txq->q, ndesc);
2384	cxgb4_ring_tx_db(adap, &eohw_txq->q, ndesc);
2385	eosw_txq_advance_index(&eosw_txq->last_pidx, 1, eosw_txq->ndesc);
2386
2387out_unlock:
2388	spin_unlock(&eohw_txq->lock);
2389	return ret;
2390}
2391
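/* Push all skbs pending between last_pidx and pidx of an ETHOFLD
 * software queue to the hardware, stopping early if the hardware queue
 * runs out of room or credits.
 */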
2392static void ethofld_xmit(struct net_device *dev, struct sge_eosw_txq *eosw_txq)
2393{
2394	struct sk_buff *skb;
2395	int pktcount, ret;
2396
2397	switch (eosw_txq->state) {
2398	case CXGB4_EO_STATE_ACTIVE:
2399	case CXGB4_EO_STATE_FLOWC_OPEN_SEND:
2400	case CXGB4_EO_STATE_FLOWC_CLOSE_SEND:
2401		pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2402		if (pktcount < 0)
2403			pktcount += eosw_txq->ndesc;
2404		break;
2405	case CXGB4_EO_STATE_FLOWC_OPEN_REPLY:
2406	case CXGB4_EO_STATE_FLOWC_CLOSE_REPLY:
2407	case CXGB4_EO_STATE_CLOSED:
2408	default:
2409		return;
2410	}
2411
2412	while (pktcount--) {
2413		skb = eosw_txq_peek(eosw_txq);
2414		if (!skb) {
2415			eosw_txq_advance_index(&eosw_txq->last_pidx, 1,
2416					       eosw_txq->ndesc);
2417			continue;
2418		}
2419
2420		ret = ethofld_hard_xmit(dev, eosw_txq);
2421		if (ret)
2422			break;
2423	}
2424}
2425
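/* Queue an skb on its ETHOFLD software Tx queue (selected from the
 * skb's queue mapping) and kick transmission.  Used in the
 * ndo_start_xmit path for TC-MQPRIO offloaded traffic classes.
 */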
2426static netdev_tx_t cxgb4_ethofld_xmit(struct sk_buff *skb,
2427				      struct net_device *dev)
2428{
2429	struct cxgb4_tc_port_mqprio *tc_port_mqprio;
2430	struct port_info *pi = netdev2pinfo(dev);
2431	struct adapter *adap = netdev2adap(dev);
2432	struct sge_eosw_txq *eosw_txq;
2433	u32 qid;
2434	int ret;
2435
2436	ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
2437	if (ret)
2438		goto out_free;
2439
2440	tc_port_mqprio = &adap->tc_mqprio->port_mqprio[pi->port_id];
2441	qid = skb_get_queue_mapping(skb) - pi->nqsets;
2442	eosw_txq = &tc_port_mqprio->eosw_txq[qid];
2443	spin_lock_bh(&eosw_txq->lock);
2444	if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2445		goto out_unlock;
2446
2447	ret = eosw_txq_enqueue(eosw_txq, skb);
2448	if (ret)
2449		goto out_unlock;
2450
2451	/* SKB is queued for processing until credits are available.
2452	 * So, call the destructor now and we'll free the skb later
2453	 * after it has been successfully transmitted.
2454	 */
2455	skb_orphan(skb);
2456
2457	eosw_txq_advance(eosw_txq, 1);
2458	ethofld_xmit(dev, eosw_txq);
2459	spin_unlock_bh(&eosw_txq->lock);
2460	return NETDEV_TX_OK;
2461
2462out_unlock:
2463	spin_unlock_bh(&eosw_txq->lock);
2464out_free:
2465	dev_kfree_skb_any(skb);
2466	return NETDEV_TX_OK;
2467}
2468
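/**
 *	t4_start_xmit - ndo_start_xmit entry point for all Ethernet Tx
 *	@skb: the packet
 *	@dev: the egress net device
 *
 *	Dispatches the packet to the VM work-request path, the ETHOFLD path
 *	or the regular Ethernet Tx path (with the PTP lock held for PTP
 *	packets) as appropriate.
 */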
2469netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
2470{
2471	struct port_info *pi = netdev_priv(dev);
2472	u16 qid = skb_get_queue_mapping(skb);
2473
2474	if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM))
2475		return cxgb4_vf_eth_xmit(skb, dev);
2476
2477	if (unlikely(qid >= pi->nqsets))
2478		return cxgb4_ethofld_xmit(skb, dev);
2479
2480	if (is_ptp_enabled(skb, dev)) {
2481		struct adapter *adap = netdev2adap(dev);
2482		netdev_tx_t ret;
2483
2484		spin_lock(&adap->ptp_lock);
2485		ret = cxgb4_eth_xmit(skb, dev);
2486		spin_unlock(&adap->ptp_lock);
2487		return ret;
2488	}
2489
2490	return cxgb4_eth_xmit(skb, dev);
2491}
2492
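/* Free any skbs still pending on an ETHOFLD software queue and rewind
 * the producer index, making room for the termination FLOWC request.
 */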
2493static void eosw_txq_flush_pending_skbs(struct sge_eosw_txq *eosw_txq)
2494{
2495	int pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2496	int pidx = eosw_txq->pidx;
2497	struct sk_buff *skb;
2498
2499	if (!pktcount)
2500		return;
2501
2502	if (pktcount < 0)
2503		pktcount += eosw_txq->ndesc;
2504
2505	while (pktcount--) {
2506		pidx--;
2507		if (pidx < 0)
2508			pidx += eosw_txq->ndesc;
2509
2510		skb = eosw_txq->desc[pidx].skb;
2511		if (skb) {
2512			dev_consume_skb_any(skb);
2513			eosw_txq->desc[pidx].skb = NULL;
2514			eosw_txq->inuse--;
2515		}
2516	}
2517
2518	eosw_txq->pidx = eosw_txq->last_pidx + 1;
2519}
2520
2521/**
2522 * cxgb4_ethofld_send_flowc - Send ETHOFLD flowc request to bind eotid to tc.
2523 * @dev: netdevice
2524 * @eotid: ETHOFLD tid to bind/unbind
2525 * @tc: traffic class. If set to FW_SCHED_CLS_NONE, then unbinds the @eotid
2526 *
2527 * Send a FLOWC work request to bind an ETHOFLD TID to a traffic class.
2528 * If @tc is set to FW_SCHED_CLS_NONE, then the @eotid is unbound from
2529 * a traffic class.
2530 */
2531int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
2532{
2533	struct port_info *pi = netdev2pinfo(dev);
2534	struct adapter *adap = netdev2adap(dev);
2535	enum sge_eosw_state next_state;
2536	struct sge_eosw_txq *eosw_txq;
2537	u32 len, len16, nparams = 6;
2538	struct fw_flowc_wr *flowc;
2539	struct eotid_entry *entry;
2540	struct sge_ofld_rxq *rxq;
2541	struct sk_buff *skb;
2542	int ret = 0;
2543
2544	len = struct_size(flowc, mnemval, nparams);
2545	len16 = DIV_ROUND_UP(len, 16);
2546
2547	entry = cxgb4_lookup_eotid(&adap->tids, eotid);
2548	if (!entry)
2549		return -ENOMEM;
2550
2551	eosw_txq = (struct sge_eosw_txq *)entry->data;
2552	if (!eosw_txq)
2553		return -ENOMEM;
2554
2555	if (!(adap->flags & CXGB4_FW_OK)) {
2556		/* Don't stall caller when access to FW is lost */
2557		complete(&eosw_txq->completion);
2558		return -EIO;
2559	}
2560
2561	skb = alloc_skb(len, GFP_KERNEL);
2562	if (!skb)
2563		return -ENOMEM;
2564
2565	spin_lock_bh(&eosw_txq->lock);
2566	if (tc != FW_SCHED_CLS_NONE) {
2567		if (eosw_txq->state != CXGB4_EO_STATE_CLOSED)
2568			goto out_free_skb;
2569
2570		next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND;
2571	} else {
2572		if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2573			goto out_free_skb;
2574
2575		next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND;
2576	}
2577
2578	flowc = __skb_put(skb, len);
2579	memset(flowc, 0, len);
2580
2581	rxq = &adap->sge.eohw_rxq[eosw_txq->hwqid];
2582	flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(len16) |
2583					  FW_WR_FLOWID_V(eosw_txq->hwtid));
2584	flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
2585					   FW_FLOWC_WR_NPARAMS_V(nparams) |
2586					   FW_WR_COMPL_V(1));
2587	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
2588	flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V(adap->pf));
2589	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
2590	flowc->mnemval[1].val = cpu_to_be32(pi->tx_chan);
2591	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
2592	flowc->mnemval[2].val = cpu_to_be32(pi->tx_chan);
2593	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
2594	flowc->mnemval[3].val = cpu_to_be32(rxq->rspq.abs_id);
2595	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
2596	flowc->mnemval[4].val = cpu_to_be32(tc);
2597	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_EOSTATE;
2598	flowc->mnemval[5].val = cpu_to_be32(tc == FW_SCHED_CLS_NONE ?
2599					    FW_FLOWC_MNEM_EOSTATE_CLOSING :
2600					    FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
2601
2602	/* Free up any pending skbs to ensure there's room for
2603	 * termination FLOWC.
2604	 */
2605	if (tc == FW_SCHED_CLS_NONE)
2606		eosw_txq_flush_pending_skbs(eosw_txq);
2607
2608	ret = eosw_txq_enqueue(eosw_txq, skb);
2609	if (ret)
2610		goto out_free_skb;
2611
2612	eosw_txq->state = next_state;
2613	eosw_txq->flowc_idx = eosw_txq->pidx;
2614	eosw_txq_advance(eosw_txq, 1);
2615	ethofld_xmit(dev, eosw_txq);
2616
2617	spin_unlock_bh(&eosw_txq->lock);
2618	return 0;
2619
2620out_free_skb:
2621	dev_consume_skb_any(skb);
2622	spin_unlock_bh(&eosw_txq->lock);
2623	return ret;
2624}
2625
2626/**
2627 *	is_imm - check whether a packet can be sent as immediate data
2628 *	@skb: the packet
2629 *
2630 *	Returns true if a packet can be sent as a WR with immediate data.
2631 */
2632static inline int is_imm(const struct sk_buff *skb)
2633{
2634	return skb->len <= MAX_CTRL_WR_LEN;
2635}
2636
2637/**
2638 *	ctrlq_check_stop - check if a control queue is full and should stop
2639 *	@q: the queue
2640 *	@wr: most recent WR written to the queue
2641 *
2642 *	Check if a control queue has become full and should be stopped.
2643 *	We clean up control queue descriptors very lazily, only when we run out of room.
2644 *	If the queue is still full after reclaiming any completed descriptors
2645 *	we suspend it and have the last WR wake it up.
2646 */
2647static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr)
2648{
2649	reclaim_completed_tx_imm(&q->q);
2650	if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2651		wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2652		q->q.stops++;
2653		q->full = 1;
2654	}
2655}
2656
2657#define CXGB4_SELFTEST_LB_STR "CHELSIO_SELFTEST"
2658
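/**
 *	cxgb4_selftest_lb_pkt - send an ethtool self-test loopback packet
 *	@netdev: the net device
 *
 *	Builds a minimal broadcast Ethernet frame carrying the
 *	CXGB4_SELFTEST_LB_STR payload, sends it on the port's first Tx
 *	queue and waits up to 10 seconds for the looped-back copy to be
 *	received.  Returns the loopback result, -ETIMEDOUT if no packet
 *	came back, or -ENOMEM if the Tx queue was full.
 */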
2659int cxgb4_selftest_lb_pkt(struct net_device *netdev)
2660{
2661	struct port_info *pi = netdev_priv(netdev);
2662	struct adapter *adap = pi->adapter;
2663	struct cxgb4_ethtool_lb_test *lb;
2664	int ret, i = 0, pkt_len, credits;
2665	struct fw_eth_tx_pkt_wr *wr;
2666	struct cpl_tx_pkt_core *cpl;
2667	u32 ctrl0, ndesc, flits;
2668	struct sge_eth_txq *q;
2669	u8 *sgl;
2670
2671	pkt_len = ETH_HLEN + sizeof(CXGB4_SELFTEST_LB_STR);
2672
2673	flits = DIV_ROUND_UP(pkt_len + sizeof(*cpl) + sizeof(*wr),
2674			     sizeof(__be64));
2675	ndesc = flits_to_desc(flits);
2676
2677	lb = &pi->ethtool_lb;
2678	lb->loopback = 1;
2679
2680	q = &adap->sge.ethtxq[pi->first_qset];
2681	__netif_tx_lock(q->txq, smp_processor_id());
2682
2683	reclaim_completed_tx(adap, &q->q, -1, true);
2684	credits = txq_avail(&q->q) - ndesc;
2685	if (unlikely(credits < 0)) {
2686		__netif_tx_unlock(q->txq);
2687		return -ENOMEM;
2688	}
2689
2690	wr = (void *)&q->q.desc[q->q.pidx];
2691	memset(wr, 0, sizeof(struct tx_desc));
2692
2693	wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
2694			       FW_WR_IMMDLEN_V(pkt_len +
2695			       sizeof(*cpl)));
2696	wr->equiq_to_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)));
2697	wr->r3 = cpu_to_be64(0);
2698
2699	cpl = (void *)(wr + 1);
2700	sgl = (u8 *)(cpl + 1);
2701
2702	ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_PF_V(adap->pf) |
2703		TXPKT_INTF_V(pi->tx_chan + 4);
2704
2705	cpl->ctrl0 = htonl(ctrl0);
2706	cpl->pack = htons(0);
2707	cpl->len = htons(pkt_len);
2708	cpl->ctrl1 = cpu_to_be64(TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F);
2709
2710	eth_broadcast_addr(sgl);
2711	i += ETH_ALEN;
2712	ether_addr_copy(&sgl[i], netdev->dev_addr);
2713	i += ETH_ALEN;
2714
2715	snprintf(&sgl[i], sizeof(CXGB4_SELFTEST_LB_STR), "%s",
2716		 CXGB4_SELFTEST_LB_STR);
2717
2718	init_completion(&lb->completion);
2719	txq_advance(&q->q, ndesc);
2720	cxgb4_ring_tx_db(adap, &q->q, ndesc);
2721	__netif_tx_unlock(q->txq);
2722
2723	/* wait for the pkt to return */
2724	ret = wait_for_completion_timeout(&lb->completion, 10 * HZ);
2725	if (!ret)
2726		ret = -ETIMEDOUT;
2727	else
2728		ret = lb->result;
2729
2730	lb->loopback = 0;
2731
2732	return ret;
2733}
2734
2735/**
2736 *	ctrl_xmit - send a packet through an SGE control Tx queue
2737 *	@q: the control queue
2738 *	@skb: the packet
2739 *
2740 *	Send a packet through an SGE control Tx queue.  Packets sent through
2741 *	a control queue must fit entirely as immediate data.
2742 */
2743static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb)
2744{
2745	unsigned int ndesc;
2746	struct fw_wr_hdr *wr;
2747
2748	if (unlikely(!is_imm(skb))) {
2749		WARN_ON(1);
2750		dev_kfree_skb(skb);
2751		return NET_XMIT_DROP;
2752	}
2753
2754	ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc));
2755	spin_lock(&q->sendq.lock);
2756
2757	if (unlikely(q->full)) {
2758		skb->priority = ndesc;                  /* save for restart */
2759		__skb_queue_tail(&q->sendq, skb);
2760		spin_unlock(&q->sendq.lock);
2761		return NET_XMIT_CN;
2762	}
2763
2764	wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2765	cxgb4_inline_tx_skb(skb, &q->q, wr);
2766
2767	txq_advance(&q->q, ndesc);
2768	if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES))
2769		ctrlq_check_stop(q, wr);
2770
2771	cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
2772	spin_unlock(&q->sendq.lock);
2773
2774	kfree_skb(skb);
2775	return NET_XMIT_SUCCESS;
2776}
2777
2778/**
2779 *	restart_ctrlq - restart a suspended control queue
2780 *	@t: pointer to the tasklet associated with this handler
2781 *
2782 *	Resumes transmission on a suspended Tx control queue.
2783 */
2784static void restart_ctrlq(struct tasklet_struct *t)
2785{
2786	struct sk_buff *skb;
2787	unsigned int written = 0;
2788	struct sge_ctrl_txq *q = from_tasklet(q, t, qresume_tsk);
2789
2790	spin_lock(&q->sendq.lock);
2791	reclaim_completed_tx_imm(&q->q);
2792	BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES);  /* q should be empty */
2793
2794	while ((skb = __skb_dequeue(&q->sendq)) != NULL) {
2795		struct fw_wr_hdr *wr;
2796		unsigned int ndesc = skb->priority;     /* previously saved */
2797
2798		written += ndesc;
2799		/* Write descriptors and free skbs outside the lock to limit
2800		 * wait times.  q->full is still set so new skbs will be queued.
2801		 */
2802		wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2803		txq_advance(&q->q, ndesc);
2804		spin_unlock(&q->sendq.lock);
2805
2806		cxgb4_inline_tx_skb(skb, &q->q, wr);
2807		kfree_skb(skb);
2808
2809		if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2810			unsigned long old = q->q.stops;
2811
2812			ctrlq_check_stop(q, wr);
2813			if (q->q.stops != old) {          /* suspended anew */
2814				spin_lock(&q->sendq.lock);
2815				goto ringdb;
2816			}
2817		}
2818		if (written > 16) {
2819			cxgb4_ring_tx_db(q->adap, &q->q, written);
2820			written = 0;
2821		}
2822		spin_lock(&q->sendq.lock);
2823	}
2824	q->full = 0;
2825ringdb:
2826	if (written)
2827		cxgb4_ring_tx_db(q->adap, &q->q, written);
2828	spin_unlock(&q->sendq.lock);
2829}
2830
2831/**
2832 *	t4_mgmt_tx - send a management message
2833 *	@adap: the adapter
2834 *	@skb: the packet containing the management message
2835 *
2836 *	Send a management message through control queue 0.
2837 */
2838int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
2839{
2840	int ret;
2841
2842	local_bh_disable();
2843	ret = ctrl_xmit(&adap->sge.ctrlq[0], skb);
2844	local_bh_enable();
2845	return ret;
2846}
2847
2848/**
2849 *	is_ofld_imm - check whether a packet can be sent as immediate data
2850 *	@skb: the packet
2851 *
2852 *	Returns true if a packet can be sent as an offload WR with immediate
2853 *	data.
2854 *	FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to its 8-bit field.
2855 *	However, FW_ULPTX_WR commands have a 256-byte immediate-only
2856 *	payload limit.
2857 */
2858static inline int is_ofld_imm(const struct sk_buff *skb)
2859{
2860	struct work_request_hdr *req = (struct work_request_hdr *)skb->data;
2861	unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi));
2862
2863	if (unlikely(opcode == FW_ULPTX_WR))
2864		return skb->len <= MAX_IMM_ULPTX_WR_LEN;
2865	else if (opcode == FW_CRYPTO_LOOKASIDE_WR)
2866		return skb->len <= SGE_MAX_WR_LEN;
2867	else
2868		return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
2869}
2870
2871/**
2872 *	calc_tx_flits_ofld - calculate # of flits for an offload packet
2873 *	@skb: the packet
2874 *
2875 *	Returns the number of flits needed for the given offload packet.
2876 *	These packets are already fully constructed and no additional headers
2877 *	will be added.
2878 */
2879static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
2880{
2881	unsigned int flits, cnt;
2882
2883	if (is_ofld_imm(skb))
2884		return DIV_ROUND_UP(skb->len, 8);
2885
2886	flits = skb_transport_offset(skb) / 8U;   /* headers */
2887	cnt = skb_shinfo(skb)->nr_frags;
2888	if (skb_tail_pointer(skb) != skb_transport_header(skb))
2889		cnt++;
2890	return flits + sgl_len(cnt);
2891}
2892
2893/**
2894 *	txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion
2895 *	@q: the queue to stop
2896 *
2897 *	Mark a Tx queue stopped due to I/O MMU exhaustion and resulting
2898 *	inability to map packets.  A periodic timer attempts to restart
2899 *	queues so marked.
2900 */
2901static void txq_stop_maperr(struct sge_uld_txq *q)
2902{
2903	q->mapping_err++;
2904	q->q.stops++;
2905	set_bit(q->q.cntxt_id - q->adap->sge.egr_start,
2906		q->adap->sge.txq_maperr);
2907}
2908
2909/**
2910 *	ofldtxq_stop - stop an offload Tx queue that has become full
2911 *	@q: the queue to stop
2912 *	@wr: the Work Request causing the queue to become full
2913 *
2914 *	Stops an offload Tx queue that has become full and modifies the packet
2915 *	being written to request a wakeup.
2916 */
2917static void ofldtxq_stop(struct sge_uld_txq *q, struct fw_wr_hdr *wr)
2918{
2919	wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2920	q->q.stops++;
2921	q->full = 1;
2922}
2923
2924/**
2925 *	service_ofldq - service/restart a suspended offload queue
2926 *	@q: the offload queue
2927 *
2928 *	Services an offload Tx queue by moving packets from its Pending Send
2929 *	Queue to the Hardware TX ring.  The function starts and ends with the
2930 *	Send Queue locked, but drops the lock while putting the skb at the
2931 *	head of the Send Queue onto the Hardware TX Ring.  Dropping the lock
2932 *	allows more skbs to be added to the Send Queue by other threads.
2933 *	The packet being processed at the head of the Pending Send Queue is
2934 *	left on the queue in case we experience DMA Mapping errors, etc.
2935 *	and need to give up and restart later.
2936 *
2937 *	service_ofldq() can be thought of as a task which opportunistically
2938 *	uses other threads execution contexts.  We use the Offload Queue
2939 *	boolean "service_ofldq_running" to make sure that only one instance
2940 *	is ever running at a time ...
2941 */
2942static void service_ofldq(struct sge_uld_txq *q)
2943	__must_hold(&q->sendq.lock)
2944{
2945	u64 *pos, *before, *end;
2946	int credits;
2947	struct sk_buff *skb;
2948	struct sge_txq *txq;
2949	unsigned int left;
2950	unsigned int written = 0;
2951	unsigned int flits, ndesc;
2952
2953	/* If another thread is currently in service_ofldq() processing the
2954	 * Pending Send Queue then there's nothing to do. Otherwise, flag
2955	 * that we're doing the work and continue.  Examining/modifying
2956	 * the Offload Queue boolean "service_ofldq_running" must be done
2957	 * while holding the Pending Send Queue Lock.
2958	 */
2959	if (q->service_ofldq_running)
2960		return;
2961	q->service_ofldq_running = true;
2962
2963	while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) {
2964		/* We drop the lock while we're working with the skb at the
2965		 * head of the Pending Send Queue.  This allows more skbs to
2966		 * be added to the Pending Send Queue while we're working on
2967		 * this one.  We don't need to lock to guard the TX Ring
2968		 * updates because only one thread of execution is ever
2969		 * allowed into service_ofldq() at a time.
2970		 */
2971		spin_unlock(&q->sendq.lock);
2972
2973		cxgb4_reclaim_completed_tx(q->adap, &q->q, false);
2974
2975		flits = skb->priority;                /* previously saved */
2976		ndesc = flits_to_desc(flits);
2977		credits = txq_avail(&q->q) - ndesc;
2978		BUG_ON(credits < 0);
2979		if (unlikely(credits < TXQ_STOP_THRES))
2980			ofldtxq_stop(q, (struct fw_wr_hdr *)skb->data);
2981
2982		pos = (u64 *)&q->q.desc[q->q.pidx];
2983		if (is_ofld_imm(skb))
2984			cxgb4_inline_tx_skb(skb, &q->q, pos);
2985		else if (cxgb4_map_skb(q->adap->pdev_dev, skb,
2986				       (dma_addr_t *)skb->head)) {
2987			txq_stop_maperr(q);
2988			spin_lock(&q->sendq.lock);
2989			break;
2990		} else {
2991			int last_desc, hdr_len = skb_transport_offset(skb);
2992
2993			/* The WR headers may not fit within one descriptor.
2994			 * So we need to deal with wrap-around here.
2995			 */
2996			before = (u64 *)pos;
2997			end = (u64 *)pos + flits;
2998			txq = &q->q;
2999			pos = (void *)inline_tx_skb_header(skb, &q->q,
3000							   (void *)pos,
3001							   hdr_len);
3002			if (before > (u64 *)pos) {
3003				left = (u8 *)end - (u8 *)txq->stat;
3004				end = (void *)txq->desc + left;
3005			}
3006
3007			/* If the current position is already at the end of the
3008			 * ofld queue, reset it to point to the start of the
3009			 * queue and update the end pointer as well.
3010			 */
3011			if (pos == (u64 *)txq->stat) {
3012				left = (u8 *)end - (u8 *)txq->stat;
3013				end = (void *)txq->desc + left;
3014				pos = (void *)txq->desc;
3015			}
3016
3017			cxgb4_write_sgl(skb, &q->q, (void *)pos,
3018					end, hdr_len,
3019					(dma_addr_t *)skb->head);
3020#ifdef CONFIG_NEED_DMA_MAP_STATE
3021			skb->dev = q->adap->port[0];
3022			skb->destructor = deferred_unmap_destructor;
3023#endif
3024			last_desc = q->q.pidx + ndesc - 1;
3025			if (last_desc >= q->q.size)
3026				last_desc -= q->q.size;
3027			q->q.sdesc[last_desc].skb = skb;
3028		}
3029
3030		txq_advance(&q->q, ndesc);
3031		written += ndesc;
3032		if (unlikely(written > 32)) {
3033			cxgb4_ring_tx_db(q->adap, &q->q, written);
3034			written = 0;
3035		}
3036
3037		/* Reacquire the Pending Send Queue Lock so we can unlink the
3038		 * skb we've just successfully transferred to the TX Ring and
3039		 * loop for the next skb which may be at the head of the
3040		 * Pending Send Queue.
3041		 */
3042		spin_lock(&q->sendq.lock);
3043		__skb_unlink(skb, &q->sendq);
3044		if (is_ofld_imm(skb))
3045			kfree_skb(skb);
3046	}
3047	if (likely(written))
3048		cxgb4_ring_tx_db(q->adap, &q->q, written);
3049
3050	/* Indicate that no thread is processing the Pending Send Queue
3051	 * currently.
3052	 */
3053	q->service_ofldq_running = false;
3054}
3055
3056/**
3057 *	ofld_xmit - send a packet through an offload queue
3058 *	@q: the Tx offload queue
3059 *	@skb: the packet
3060 *
3061 *	Send an offload packet through an SGE offload queue.
3062 */
3063static int ofld_xmit(struct sge_uld_txq *q, struct sk_buff *skb)
3064{
3065	skb->priority = calc_tx_flits_ofld(skb);       /* save for restart */
3066	spin_lock(&q->sendq.lock);
3067
3068	/* Queue the new skb onto the Offload Queue's Pending Send Queue.  If
3069	 * that results in this new skb being the only one on the queue, start
3070	 * servicing it.  If there are other skbs already on the list, then
3071	 * either the queue is currently being processed or it's been stopped
3072	 * for some reason and it'll be restarted at a later time.  Restart
3073	 * paths are triggered by events like experiencing a DMA Mapping Error
3074	 * or filling the Hardware TX Ring.
3075	 */
3076	__skb_queue_tail(&q->sendq, skb);
3077	if (q->sendq.qlen == 1)
3078		service_ofldq(q);
3079
3080	spin_unlock(&q->sendq.lock);
3081	return NET_XMIT_SUCCESS;
3082}
3083
3084/**
3085 *	restart_ofldq - restart a suspended offload queue
3086 *	@t: pointer to the tasklet associated with this handler
3087 *
3088 *	Resumes transmission on a suspended Tx offload queue.
3089 */
3090static void restart_ofldq(struct tasklet_struct *t)
3091{
3092	struct sge_uld_txq *q = from_tasklet(q, t, qresume_tsk);
3093
3094	spin_lock(&q->sendq.lock);
3095	q->full = 0;            /* the queue actually is completely empty now */
3096	service_ofldq(q);
3097	spin_unlock(&q->sendq.lock);
3098}
3099
3100/**
3101 *	skb_txq - return the Tx queue an offload packet should use
3102 *	@skb: the packet
3103 *
3104 *	Returns the Tx queue an offload packet should use as indicated by bits
3105 *	1-15 in the packet's queue_mapping.
3106 */
3107static inline unsigned int skb_txq(const struct sk_buff *skb)
3108{
3109	return skb->queue_mapping >> 1;
3110}
3111
3112/**
3113 *	is_ctrl_pkt - return whether an offload packet is a control packet
3114 *	@skb: the packet
3115 *
3116 *	Returns whether an offload packet should use an OFLD or a CTRL
3117 *	Tx queue as indicated by bit 0 in the packet's queue_mapping.
3118 */
3119static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb)
3120{
3121	return skb->queue_mapping & 1;
3122}
3123
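/* Send an offload packet through the control or offload Tx queue
 * selected by the skb's queue_mapping: bit 0 chooses CTRL vs. OFLD and
 * bits 1-15 give the queue index (see is_ctrl_pkt() and skb_txq()).
 */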
3124static inline int uld_send(struct adapter *adap, struct sk_buff *skb,
3125			   unsigned int tx_uld_type)
3126{
3127	struct sge_uld_txq_info *txq_info;
3128	struct sge_uld_txq *txq;
3129	unsigned int idx = skb_txq(skb);
3130
3131	if (unlikely(is_ctrl_pkt(skb))) {
3132		/* Single ctrl queue is a requirement for LE workaround path */
3133		if (adap->tids.nsftids)
3134			idx = 0;
3135		return ctrl_xmit(&adap->sge.ctrlq[idx], skb);
3136	}
3137
3138	txq_info = adap->sge.uld_txq_info[tx_uld_type];
3139	if (unlikely(!txq_info)) {
3140		WARN_ON(true);
3141		kfree_skb(skb);
3142		return NET_XMIT_DROP;
3143	}
3144
3145	txq = &txq_info->uldtxq[idx];
3146	return ofld_xmit(txq, skb);
3147}
3148
3149/**
3150 *	t4_ofld_send - send an offload packet
3151 *	@adap: the adapter
3152 *	@skb: the packet
3153 *
3154 *	Sends an offload packet.  We use the packet queue_mapping to select the
3155 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
3156 *	should be sent as regular or control, bits 1-15 select the queue.
3157 */
3158int t4_ofld_send(struct adapter *adap, struct sk_buff *skb)
3159{
3160	int ret;
3161
3162	local_bh_disable();
3163	ret = uld_send(adap, skb, CXGB4_TX_OFLD);
3164	local_bh_enable();
3165	return ret;
3166}
3167
3168/**
3169 *	cxgb4_ofld_send - send an offload packet
3170 *	@dev: the net device
3171 *	@skb: the packet
3172 *
3173 *	Sends an offload packet.  This is an exported version of @t4_ofld_send,
3174 *	intended for ULDs.
3175 */
3176int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb)
3177{
3178	return t4_ofld_send(netdev2adap(dev), skb);
3179}
3180EXPORT_SYMBOL(cxgb4_ofld_send);
3181
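/* Copy @length bytes of an immediate Work Request from @src into the
 * Tx descriptor ring at @pos, wrapping past the end of the ring if
 * necessary, then zero-pad to a multiple of 16 bytes and return the
 * position just past the padded data.
 */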
3182static void *inline_tx_header(const void *src,
3183			      const struct sge_txq *q,
3184			      void *pos, int length)
3185{
3186	int left = (void *)q->stat - pos;
3187	u64 *p;
3188
3189	if (likely(length <= left)) {
3190		memcpy(pos, src, length);
3191		pos += length;
3192	} else {
3193		memcpy(pos, src, left);
3194		memcpy(q->desc, src + left, length - left);
3195		pos = (void *)q->desc + (length - left);
3196	}
3197	/* 0-pad to multiple of 16 */
3198	p = PTR_ALIGN(pos, 8);
3199	if ((uintptr_t)p & 8) {
3200		*p = 0;
3201		return p + 1;
3202	}
3203	return p;
3204}
3205
3206/**
3207 *      ofld_xmit_direct - copy a WR into offload queue
3208 *      @q: the Tx offload queue
3209 *      @src: location of WR
3210 *      @len: WR length
3211 *
3212 *      Copy an immediate WR into an uncontended SGE offload queue.
3213 */
3214static int ofld_xmit_direct(struct sge_uld_txq *q, const void *src,
3215			    unsigned int len)
3216{
3217	unsigned int ndesc;
3218	int credits;
3219	u64 *pos;
3220
3221	/* Use the lower limit as the cut-off */
3222	if (len > MAX_IMM_OFLD_TX_DATA_WR_LEN) {
3223		WARN_ON(1);
3224		return NET_XMIT_DROP;
3225	}
3226
3227	/* Don't return NET_XMIT_CN here as the current
3228	 * implementation doesn't queue the request
3229	 * using an skb when the following conditions are not met.
3230	 */
3231	if (!spin_trylock(&q->sendq.lock))
3232		return NET_XMIT_DROP;
3233
3234	if (q->full || !skb_queue_empty(&q->sendq) ||
3235	    q->service_ofldq_running) {
3236		spin_unlock(&q->sendq.lock);
3237		return NET_XMIT_DROP;
3238	}
3239	ndesc = flits_to_desc(DIV_ROUND_UP(len, 8));
3240	credits = txq_avail(&q->q) - ndesc;
3241	pos = (u64 *)&q->q.desc[q->q.pidx];
3242
3243	/* ofldtxq_stop modifies WR header in-situ */
3244	inline_tx_header(src, &q->q, pos, len);
3245	if (unlikely(credits < TXQ_STOP_THRES))
3246		ofldtxq_stop(q, (struct fw_wr_hdr *)pos);
3247	txq_advance(&q->q, ndesc);
3248	cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
3249
3250	spin_unlock(&q->sendq.lock);
3251	return NET_XMIT_SUCCESS;
3252}
3253
3254int cxgb4_immdata_send(struct net_device *dev, unsigned int idx,
3255		       const void *src, unsigned int len)
3256{
3257	struct sge_uld_txq_info *txq_info;
3258	struct sge_uld_txq *txq;
3259	struct adapter *adap;
3260	int ret;
3261
3262	adap = netdev2adap(dev);
3263
3264	local_bh_disable();
3265	txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
3266	if (unlikely(!txq_info)) {
3267		WARN_ON(true);
3268		local_bh_enable();
3269		return NET_XMIT_DROP;
3270	}
3271	txq = &txq_info->uldtxq[idx];
3272
3273	ret = ofld_xmit_direct(txq, src, len);
3274	local_bh_enable();
3275	return net_xmit_eval(ret);
3276}
3277EXPORT_SYMBOL(cxgb4_immdata_send);
3278
3279/**
3280 *	t4_crypto_send - send crypto packet
3281 *	@adap: the adapter
3282 *	@skb: the packet
3283 *
3284 *	Sends crypto packet.  We use the packet queue_mapping to select the
3285 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
3286 *	should be sent as regular or control, bits 1-15 select the queue.
3287 */
3288static int t4_crypto_send(struct adapter *adap, struct sk_buff *skb)
3289{
3290	int ret;
3291
3292	local_bh_disable();
3293	ret = uld_send(adap, skb, CXGB4_TX_CRYPTO);
3294	local_bh_enable();
3295	return ret;
3296}
3297
3298/**
3299 *	cxgb4_crypto_send - send crypto packet
3300 *	@dev: the net device
3301 *	@skb: the packet
3302 *
3303 *	Sends crypto packet.  This is an exported version of @t4_crypto_send,
3304 *	intended for ULDs.
3305 */
3306int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb)
3307{
3308	return t4_crypto_send(netdev2adap(dev), skb);
3309}
3310EXPORT_SYMBOL(cxgb4_crypto_send);
3311
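/* Attach the pages of a packet gather list to @skb as page fragments,
 * skipping @offset bytes of the first fragment.  A reference is taken
 * on the last page because the gather list does not own it.
 */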
3312static inline void copy_frags(struct sk_buff *skb,
3313			      const struct pkt_gl *gl, unsigned int offset)
3314{
3315	int i;
3316
3317	/* usually there's just one frag */
3318	__skb_fill_page_desc(skb, 0, gl->frags[0].page,
3319			     gl->frags[0].offset + offset,
3320			     gl->frags[0].size - offset);
3321	skb_shinfo(skb)->nr_frags = gl->nfrags;
3322	for (i = 1; i < gl->nfrags; i++)
3323		__skb_fill_page_desc(skb, i, gl->frags[i].page,
3324				     gl->frags[i].offset,
3325				     gl->frags[i].size);
3326
3327	/* get a reference to the last page, we don't own it */
3328	get_page(gl->frags[gl->nfrags - 1].page);
3329}
3330
3331/**
3332 *	cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list
3333 *	@gl: the gather list
3334 *	@skb_len: size of sk_buff main body if it carries fragments
3335 *	@pull_len: amount of data to move to the sk_buff's main body
3336 *
3337 *	Builds an sk_buff from the given packet gather list.  Returns the
3338 *	sk_buff or %NULL if sk_buff allocation failed.
3339 */
3340struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
3341				   unsigned int skb_len, unsigned int pull_len)
3342{
3343	struct sk_buff *skb;
3344
3345	/*
3346	 * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer
3347	 * size, which is expected since buffers are at least PAGE_SIZEd.
3348	 * In this case packets up to RX_COPY_THRES have only one fragment.
3349	 */
3350	if (gl->tot_len <= RX_COPY_THRES) {
3351		skb = dev_alloc_skb(gl->tot_len);
3352		if (unlikely(!skb))
3353			goto out;
3354		__skb_put(skb, gl->tot_len);
3355		skb_copy_to_linear_data(skb, gl->va, gl->tot_len);
3356	} else {
3357		skb = dev_alloc_skb(skb_len);
3358		if (unlikely(!skb))
3359			goto out;
3360		__skb_put(skb, pull_len);
3361		skb_copy_to_linear_data(skb, gl->va, pull_len);
3362
3363		copy_frags(skb, gl, pull_len);
3364		skb->len = gl->tot_len;
3365		skb->data_len = skb->len - pull_len;
3366		skb->truesize += skb->data_len;
3367	}
3368out:	return skb;
3369}
3370EXPORT_SYMBOL(cxgb4_pktgl_to_skb);
3371
3372/**
3373 *	t4_pktgl_free - free a packet gather list
3374 *	@gl: the gather list
3375 *
3376 *	Releases the pages of a packet gather list.  We do not own the last
3377 *	page on the list and do not free it.
3378 */
3379static void t4_pktgl_free(const struct pkt_gl *gl)
3380{
3381	int n;
3382	const struct page_frag *p;
3383
3384	for (p = gl->frags, n = gl->nfrags - 1; n--; p++)
3385		put_page(p->page);
3386}
3387
3388/*
3389 * Process an MPS trace packet.  Give it an unused protocol number so it won't
3390 * be delivered to anyone and send it to the stack for capture.
3391 */
3392static noinline int handle_trace_pkt(struct adapter *adap,
3393				     const struct pkt_gl *gl)
3394{
3395	struct sk_buff *skb;
3396
3397	skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
3398	if (unlikely(!skb)) {
3399		t4_pktgl_free(gl);
3400		return 0;
3401	}
3402
3403	if (is_t4(adap->params.chip))
3404		__skb_pull(skb, sizeof(struct cpl_trace_pkt));
3405	else
3406		__skb_pull(skb, sizeof(struct cpl_t5_trace_pkt));
3407
3408	skb_reset_mac_header(skb);
3409	skb->protocol = htons(0xffff);
3410	skb->dev = adap->port[0];
3411	netif_receive_skb(skb);
3412	return 0;
3413}
3414
3415/**
3416 * cxgb4_sgetim_to_hwtstamp - convert sge time stamp to hw time stamp
3417 * @adap: the adapter
3418 * @hwtstamps: time stamp structure to update
3419 * @sgetstamp: 60bit iqe timestamp
3420 *
3421 * Every ingress queue entry carries a 60-bit timestamp in Core Clock
3422 * ticks; convert it to a ktime_t and assign it to @hwtstamps.
3423 */
3424static void cxgb4_sgetim_to_hwtstamp(struct adapter *adap,
3425				     struct skb_shared_hwtstamps *hwtstamps,
3426				     u64 sgetstamp)
3427{
3428	u64 ns;
3429	u64 tmp = (sgetstamp * 1000 * 1000 + adap->params.vpd.cclk / 2);
3430
3431	ns = div_u64(tmp, adap->params.vpd.cclk);
3432
3433	memset(hwtstamps, 0, sizeof(*hwtstamps));
3434	hwtstamps->hwtstamp = ns_to_ktime(ns);
3435}
3436
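/* Hand a received Ethernet packet to the GRO layer: attach the gather
 * list fragments to a napi skb (dropping the packet if none is
 * available), fill in checksum, RSS hash, timestamp and VLAN
 * information, then feed it to napi_gro_frags() and update the queue's
 * LRO statistics.
 */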
3437static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
3438		   const struct cpl_rx_pkt *pkt, unsigned long tnl_hdr_len)
3439{
3440	struct adapter *adapter = rxq->rspq.adap;
3441	struct sge *s = &adapter->sge;
3442	struct port_info *pi;
3443	int ret;
3444	struct sk_buff *skb;
3445
3446	skb = napi_get_frags(&rxq->rspq.napi);
3447	if (unlikely(!skb)) {
3448		t4_pktgl_free(gl);
3449		rxq->stats.rx_drops++;
3450		return;
3451	}
3452
3453	copy_frags(skb, gl, s->pktshift);
3454	if (tnl_hdr_len)
3455		skb->csum_level = 1;
3456	skb->len = gl->tot_len - s->pktshift;
3457	skb->data_len = skb->len;
3458	skb->truesize += skb->data_len;
3459	skb->ip_summed = CHECKSUM_UNNECESSARY;
3460	skb_record_rx_queue(skb, rxq->rspq.idx);
3461	pi = netdev_priv(skb->dev);
3462	if (pi->rxtstamp)
3463		cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb),
3464					 gl->sgetstamp);
3465	if (rxq->rspq.netdev->features & NETIF_F_RXHASH)
3466		skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3467			     PKT_HASH_TYPE_L3);
3468
3469	if (unlikely(pkt->vlan_ex)) {
3470		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3471		rxq->stats.vlan_ex++;
3472	}
3473	ret = napi_gro_frags(&rxq->rspq.napi);
3474	if (ret == GRO_HELD)
3475		rxq->stats.lro_pkts++;
3476	else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
3477		rxq->stats.lro_merged++;
3478	rxq->stats.pkts++;
3479	rxq->stats.rx_cso++;
3480}
3481
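/* Return values used by the PTP Rx timestamp helpers below. */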
3482enum {
3483	RX_NON_PTP_PKT = 0,
3484	RX_PTP_PKT_SUC = 1,
3485	RX_PTP_PKT_ERR = 2
3486};
3487
3488/**
3489 *     t4_systim_to_hwstamp - read hardware time stamp
3490 *     @adapter: the adapter
3491 *     @skb: the packet
3492 *
3493 *     Read the timestamp from the MPS packet and insert it into the skb,
3494 *     which is then forwarded to the PTP application
3495 */
3496static noinline int t4_systim_to_hwstamp(struct adapter *adapter,
3497					 struct sk_buff *skb)
3498{
3499	struct skb_shared_hwtstamps *hwtstamps;
3500	struct cpl_rx_mps_pkt *cpl = NULL;
3501	unsigned char *data;
3502	int offset;
3503
3504	cpl = (struct cpl_rx_mps_pkt *)skb->data;
3505	if (!(CPL_RX_MPS_PKT_TYPE_G(ntohl(cpl->op_to_r1_hi)) &
3506	     X_CPL_RX_MPS_PKT_TYPE_PTP))
3507		return RX_PTP_PKT_ERR;
3508
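	/* The 64-bit hardware timestamp immediately follows the CPL header;
	 * pull the CPL plus two 64-bit words so that skb->data lands on the
	 * Ethernet header of the encapsulated PTP message.
	 */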
3509	data = skb->data + sizeof(*cpl);
3510	skb_pull(skb, 2 * sizeof(u64) + sizeof(struct cpl_rx_mps_pkt));
3511	offset = ETH_HLEN + IPV4_HLEN(skb->data) + UDP_HLEN;
3512	if (skb->len < offset + OFF_PTP_SEQUENCE_ID + sizeof(short))
3513		return RX_PTP_PKT_ERR;
3514
3515	hwtstamps = skb_hwtstamps(skb);
3516	memset(hwtstamps, 0, sizeof(*hwtstamps));
3517	hwtstamps->hwtstamp = ns_to_ktime(get_unaligned_be64(data));
3518
3519	return RX_PTP_PKT_SUC;
3520}
3521
3522/**
3523 *     t4_rx_hststamp - Recv PTP Event Message
3524 *     @adapter: the adapter
3525 *     @rsp: the response queue descriptor holding the RX_PKT message
3526 *     @rxq: the response queue holding the RX_PKT message
3527 *     @skb: the packet
3528 *
3529 *     If PTP is enabled and this is an MPS packet, read the HW timestamp.
3530 */
3531static int t4_rx_hststamp(struct adapter *adapter, const __be64 *rsp,
3532			  struct sge_eth_rxq *rxq, struct sk_buff *skb)
3533{
3534	int ret;
3535
3536	if (unlikely((*(u8 *)rsp == CPL_RX_MPS_PKT) &&
3537		     !is_t4(adapter->params.chip))) {
3538		ret = t4_systim_to_hwstamp(adapter, skb);
3539		if (ret == RX_PTP_PKT_ERR) {
3540			kfree_skb(skb);
3541			rxq->stats.rx_drops++;
3542		}
3543		return ret;
3544	}
3545	return RX_NON_PTP_PKT;
3546}
3547
3548/**
3549 *      t4_tx_hststamp - Loopback PTP Transmit Event Message
3550 *      @adapter: the adapter
3551 *      @skb: the packet
3552 *      @dev: the ingress net device
3553 *
3554 *      Read hardware timestamp for the loopback PTP Tx event message
3555 */
3556static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb,
3557			  struct net_device *dev)
3558{
3559	struct port_info *pi = netdev_priv(dev);
3560
3561	if (!is_t4(adapter->params.chip) && adapter->ptp_tx_skb) {
3562		cxgb4_ptp_read_hwstamp(adapter, pi);
3563		kfree_skb(skb);
3564		return 0;
3565	}
3566	return 1;
3567}
3568
3569/**
3570 *	t4_tx_completion_handler - handle CPL_SGE_EGR_UPDATE messages
3571 *	@rspq: Ethernet RX Response Queue associated with Ethernet TX Queue
3572 *	@rsp: Response Entry pointer into Response Queue
3573 *	@gl: Gather List pointer
3574 *
3575 *	For adapters which support the SGE Doorbell Queue Timer facility,
3576 *	we configure the Ethernet TX Queues to send CIDX Updates to the
3577 *	Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
3578 *	messages.  This adds a small load to PCIe Link RX bandwidth and,
3579 *	potentially, a higher CPU Interrupt load, but allows us to respond
3580 *	much more quickly to the CIDX Updates.  This is important for
3581 *	Upper Layer Software which isn't willing to have a large amount
3582 *	of TX Data outstanding before receiving DMA Completions.
3583 */
3584static void t4_tx_completion_handler(struct sge_rspq *rspq,
3585				     const __be64 *rsp,
3586				     const struct pkt_gl *gl)
3587{
3588	u8 opcode = ((const struct rss_header *)rsp)->opcode;
3589	struct port_info *pi = netdev_priv(rspq->netdev);
3590	struct adapter *adapter = rspq->adap;
3591	struct sge *s = &adapter->sge;
3592	struct sge_eth_txq *txq;
3593
3594	/* skip RSS header */
3595	rsp++;
3596
3597	/* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
3598	 */
3599	if (unlikely(opcode == CPL_FW4_MSG &&
3600		     ((const struct cpl_fw4_msg *)rsp)->type ==
3601							FW_TYPE_RSSCPL)) {
3602		rsp++;
3603		opcode = ((const struct rss_header *)rsp)->opcode;
3604		rsp++;
3605	}
3606
3607	if (unlikely(opcode != CPL_SGE_EGR_UPDATE)) {
3608		pr_info("%s: unexpected FW4/CPL %#x on Rx queue\n",
3609			__func__, opcode);
3610		return;
3611	}
3612
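	/* CIDX Updates for a TX Queue are delivered to the RX Response Queue
	 * of the same queue set; reclaim its completed descriptors (-1 means
	 * no explicit reclaim limit from the caller).
	 */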
3613	txq = &s->ethtxq[pi->first_qset + rspq->idx];
3614	t4_sge_eth_txq_egress_update(adapter, txq, -1);
3615}
3616
3617static int cxgb4_validate_lb_pkt(struct port_info *pi, const struct pkt_gl *si)
3618{
3619	struct adapter *adap = pi->adapter;
3620	struct cxgb4_ethtool_lb_test *lb;
3621	struct sge *s = &adap->sge;
3622	struct net_device *netdev;
3623	u8 *data;
3624	int i;
3625
3626	netdev = adap->port[pi->port_id];
3627	lb = &pi->ethtool_lb;
3628	data = si->va + s->pktshift;
3629
3630	i = ETH_ALEN;
3631	if (!ether_addr_equal(data + i, netdev->dev_addr))
3632		return -1;
3633
3634	i += ETH_ALEN;
3635	if (strcmp(&data[i], CXGB4_SELFTEST_LB_STR))
3636		lb->result = -EIO;
3637
3638	complete(&lb->completion);
3639	return 0;
3640}
3641
3642/**
3643 *	t4_ethrx_handler - process an ingress ethernet packet
3644 *	@q: the response queue that received the packet
3645 *	@rsp: the response queue descriptor holding the RX_PKT message
3646 *	@si: the gather list of packet fragments
3647 *
3648 *	Process an ingress ethernet packet and deliver it to the stack.
3649 */
3650int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
3651		     const struct pkt_gl *si)
3652{
3653	bool csum_ok;
3654	struct sk_buff *skb;
3655	const struct cpl_rx_pkt *pkt;
3656	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3657	struct adapter *adapter = q->adap;
3658	struct sge *s = &q->adap->sge;
3659	int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
3660			    CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
3661	u16 err_vec, tnl_hdr_len = 0;
3662	struct port_info *pi;
3663	int ret = 0;
3664
3665	pi = netdev_priv(q->netdev);
3666	/* If we're looking at TX Queue CIDX Update, handle that separately
3667	 * and return.
3668	 */
3669	if (unlikely((*(u8 *)rsp == CPL_FW4_MSG) ||
3670		     (*(u8 *)rsp == CPL_SGE_EGR_UPDATE))) {
3671		t4_tx_completion_handler(q, rsp, si);
3672		return 0;
3673	}
3674
3675	if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
3676		return handle_trace_pkt(q->adap, si);
3677
3678	pkt = (const struct cpl_rx_pkt *)rsp;
3679	/* Compressed error vector is enabled for T6 only */
3680	if (q->adap->params.tp.rx_pkt_encap) {
3681		err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec));
3682		tnl_hdr_len = T6_RX_TNLHDR_LEN_G(ntohs(pkt->err_vec));
3683	} else {
3684		err_vec = be16_to_cpu(pkt->err_vec);
3685	}
3686
3687	csum_ok = pkt->csum_calc && !err_vec &&
3688		  (q->netdev->features & NETIF_F_RXCSUM);
3689
3690	if (err_vec)
3691		rxq->stats.bad_rx_pkts++;
3692
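	/* During an ethtool self-test, frames coming back on a loopback
	 * channel (pkt->iff >= NCHAN) are checked against the test pattern
	 * and, when they match our MAC address, consumed here rather than
	 * being passed up the stack.
	 */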
3693	if (unlikely(pi->ethtool_lb.loopback && pkt->iff >= NCHAN)) {
3694		ret = cxgb4_validate_lb_pkt(pi, si);
3695		if (!ret)
3696			return 0;
3697	}
3698
3699	if (((pkt->l2info & htonl(RXF_TCP_F)) ||
3700	     tnl_hdr_len) &&
3701	    (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
3702		do_gro(rxq, si, pkt, tnl_hdr_len);
3703		return 0;
3704	}
3705
3706	skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN);
3707	if (unlikely(!skb)) {
3708		t4_pktgl_free(si);
3709		rxq->stats.rx_drops++;
3710		return 0;
3711	}
3712
3713	/* Handle PTP Event Rx packet */
3714	if (unlikely(pi->ptp_enable)) {
3715		ret = t4_rx_hststamp(adapter, rsp, rxq, skb);
3716		if (ret == RX_PTP_PKT_ERR)
3717			return 0;
3718	}
3719	if (likely(!ret))
3720		__skb_pull(skb, s->pktshift); /* remove ethernet header pad */
3721
3722	/* Handle the PTP Event Tx Loopback packet */
3723	if (unlikely(pi->ptp_enable && !ret &&
3724		     (pkt->l2info & htonl(RXF_UDP_F)) &&
3725		     cxgb4_ptp_is_ptp_rx(skb))) {
3726		if (!t4_tx_hststamp(adapter, skb, q->netdev))
3727			return 0;
3728	}
3729
3730	skb->protocol = eth_type_trans(skb, q->netdev);
3731	skb_record_rx_queue(skb, q->idx);
3732	if (skb->dev->features & NETIF_F_RXHASH)
3733		skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3734			     PKT_HASH_TYPE_L3);
3735
3736	rxq->stats.pkts++;
3737
3738	if (pi->rxtstamp)
3739		cxgb4_sgetim_to_hwtstamp(q->adap, skb_hwtstamps(skb),
3740					 si->sgetstamp);
3741	if (csum_ok && (pkt->l2info & htonl(RXF_UDP_F | RXF_TCP_F))) {
3742		if (!pkt->ip_frag) {
3743			skb->ip_summed = CHECKSUM_UNNECESSARY;
3744			rxq->stats.rx_cso++;
3745		} else if (pkt->l2info & htonl(RXF_IP_F)) {
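			/* IP fragment: the L4 checksum can't be validated per
			 * fragment, but the hardware supplies the full packet
			 * checksum, so hand that up as CHECKSUM_COMPLETE (for
			 * tunnelled traffic, mark the checksums as already
			 * verified instead).
			 */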
3746			__sum16 c = (__force __sum16)pkt->csum;
3747			skb->csum = csum_unfold(c);
3748
3749			if (tnl_hdr_len) {
3750				skb->ip_summed = CHECKSUM_UNNECESSARY;
3751				skb->csum_level = 1;
3752			} else {
3753				skb->ip_summed = CHECKSUM_COMPLETE;
3754			}
3755			rxq->stats.rx_cso++;
3756		}
3757	} else {
3758		skb_checksum_none_assert(skb);
3759#ifdef CONFIG_CHELSIO_T4_FCOE
3760#define CPL_RX_PKT_FLAGS (RXF_PSH_F | RXF_SYN_F | RXF_UDP_F | \
3761			  RXF_TCP_F | RXF_IP_F | RXF_IP6_F | RXF_LRO_F)
3762
3763		if (!(pkt->l2info & cpu_to_be32(CPL_RX_PKT_FLAGS))) {
3764			if ((pkt->l2info & cpu_to_be32(RXF_FCOE_F)) &&
3765			    (pi->fcoe.flags & CXGB_FCOE_ENABLED)) {
3766				if (q->adap->params.tp.rx_pkt_encap)
3767					csum_ok = err_vec &
3768						  T6_COMPR_RXERR_SUM_F;
3769				else
3770					csum_ok = err_vec & RXERR_CSUM_F;
3771				if (!csum_ok)
3772					skb->ip_summed = CHECKSUM_UNNECESSARY;
3773			}
3774		}
3775
3776#undef CPL_RX_PKT_FLAGS
3777#endif /* CONFIG_CHELSIO_T4_FCOE */
3778	}
3779
3780	if (unlikely(pkt->vlan_ex)) {
3781		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3782		rxq->stats.vlan_ex++;
3783	}
3784	skb_mark_napi_id(skb, &q->napi);
3785	netif_receive_skb(skb);
3786	return 0;
3787}
3788
3789/**
3790 *	restore_rx_bufs - put back a packet's Rx buffers
3791 *	@si: the packet gather list
3792 *	@q: the SGE free list
3793 *	@frags: number of FL buffers to restore
3794 *
3795 *	Puts back on an FL the Rx buffers associated with @si.  The buffers
3796 *	have already been unmapped and are left unmapped, we mark them so to
3797 *	prevent further unmapping attempts.
3798 *
3799 *	This function undoes a series of @unmap_rx_buf calls when we find out
3800 *	that the current packet can't be processed right away afterall and we
3801 *	need to come back to it later.  This is a very rare event and there's
3802 *	no effort to make this particularly efficient.
3803 */
3804static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q,
3805			    int frags)
3806{
3807	struct rx_sw_desc *d;
3808
3809	while (frags--) {
3810		if (q->cidx == 0)
3811			q->cidx = q->size - 1;
3812		else
3813			q->cidx--;
3814		d = &q->sdesc[q->cidx];
3815		d->page = si->frags[frags].page;
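		/* Flag the buffer as already unmapped so later free/unmap
		 * paths won't try to DMA-unmap it again.
		 */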
3816		d->dma_addr |= RX_UNMAPPED_BUF;
3817		q->avail++;
3818	}
3819}
3820
3821/**
3822 *	is_new_response - check if a response is newly written
3823 *	@r: the response descriptor
3824 *	@q: the response queue
3825 *
3826 *	Returns true if a response descriptor contains a yet unprocessed
3827 *	response.
3828 */
3829static inline bool is_new_response(const struct rsp_ctrl *r,
3830				   const struct sge_rspq *q)
3831{
3832	return (r->type_gen >> RSPD_GEN_S) == q->gen;
3833}
3834
3835/**
3836 *	rspq_next - advance to the next entry in a response queue
3837 *	@q: the queue
3838 *
3839 *	Updates the state of a response queue to advance it to the next entry.
3840 */
3841static inline void rspq_next(struct sge_rspq *q)
3842{
3843	q->cur_desc = (void *)q->cur_desc + q->iqe_len;
3844	if (unlikely(++q->cidx == q->size)) {
3845		q->cidx = 0;
3846		q->gen ^= 1;
3847		q->cur_desc = q->desc;
3848	}
3849}
3850
3851/**
3852 *	process_responses - process responses from an SGE response queue
3853 *	@q: the ingress queue to process
3854 *	@budget: how many responses can be processed in this round
3855 *
3856 *	Process responses from an SGE response queue up to the supplied budget.
3857 *	Responses include received packets as well as control messages from FW
3858 *	or HW.
3859 *
3860 *	Additionally choose the interrupt holdoff time for the next interrupt
3861 *	on this queue.  If the system is under memory shortage, use a fairly
3862 *	long delay to help recovery.
3863 */
3864static int process_responses(struct sge_rspq *q, int budget)
3865{
3866	int ret, rsp_type;
3867	int budget_left = budget;
3868	const struct rsp_ctrl *rc;
3869	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3870	struct adapter *adapter = q->adap;
3871	struct sge *s = &adapter->sge;
3872
3873	while (likely(budget_left)) {
3874		rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
3875		if (!is_new_response(rc, q)) {
3876			if (q->flush_handler)
3877				q->flush_handler(q);
3878			break;
3879		}
3880
3881		dma_rmb();
3882		rsp_type = RSPD_TYPE_G(rc->type_gen);
3883		if (likely(rsp_type == RSPD_TYPE_FLBUF_X)) {
3884			struct page_frag *fp;
3885			struct pkt_gl si;
3886			const struct rx_sw_desc *rsd;
3887			u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags;
3888
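			/* RSPD_NEWBUF_F indicates the packet starts in a new
			 * Free List buffer; retire the buffer we had been
			 * packing into and reset the packing offset.
			 */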
3889			if (len & RSPD_NEWBUF_F) {
3890				if (likely(q->offset > 0)) {
3891					free_rx_bufs(q->adap, &rxq->fl, 1);
3892					q->offset = 0;
3893				}
3894				len = RSPD_LEN_G(len);
3895			}
3896			si.tot_len = len;
3897
3898			/* gather packet fragments */
3899			for (frags = 0, fp = si.frags; ; frags++, fp++) {
3900				rsd = &rxq->fl.sdesc[rxq->fl.cidx];
3901				bufsz = get_buf_size(adapter, rsd);
3902				fp->page = rsd->page;
3903				fp->offset = q->offset;
3904				fp->size = min(bufsz, len);
3905				len -= fp->size;
3906				if (!len)
3907					break;
3908				unmap_rx_buf(q->adap, &rxq->fl);
3909			}
3910
3911			si.sgetstamp = SGE_TIMESTAMP_G(
3912					be64_to_cpu(rc->last_flit));
3913			/*
3914			 * Last buffer remains mapped so explicitly make it
3915			 * coherent for CPU access.
3916			 */
3917			dma_sync_single_for_cpu(q->adap->pdev_dev,
3918						get_buf_addr(rsd),
3919						fp->size, DMA_FROM_DEVICE);
3920
3921			si.va = page_address(si.frags[0].page) +
3922				si.frags[0].offset;
3923			prefetch(si.va);
3924
3925			si.nfrags = frags + 1;
3926			ret = q->handler(q, q->cur_desc, &si);
3927			if (likely(ret == 0))
3928				q->offset += ALIGN(fp->size, s->fl_align);
3929			else
3930				restore_rx_bufs(&si, &rxq->fl, frags);
3931		} else if (likely(rsp_type == RSPD_TYPE_CPL_X)) {
3932			ret = q->handler(q, q->cur_desc, NULL);
3933		} else {
3934			ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN);
3935		}
3936
3937		if (unlikely(ret)) {
3938			/* couldn't process descriptor, back off for recovery */
3939			q->next_intr_params = QINTR_TIMER_IDX_V(NOMEM_TMR_IDX);
3940			break;
3941		}
3942
3943		rspq_next(q);
3944		budget_left--;
3945	}
3946
3947	if (q->offset >= 0 && fl_cap(&rxq->fl) - rxq->fl.avail >= 16)
3948		__refill_fl(q->adap, &rxq->fl);
3949	return budget - budget_left;
3950}
3951
3952/**
3953 *	napi_rx_handler - the NAPI handler for Rx processing
3954 *	@napi: the napi instance
3955 *	@budget: how many packets we can process in this round
3956 *
3957 *	Handler for new data events when using NAPI.  This does not need any
3958 *	locking or protection from interrupts as data interrupts are off at
3959 *	this point and other adapter interrupts do not interfere (the latter
3960 *	is not a concern at all with MSI-X as non-data interrupts then have
3961 *	a separate handler).
3962 */
3963static int napi_rx_handler(struct napi_struct *napi, int budget)
3964{
3965	unsigned int params;
3966	struct sge_rspq *q = container_of(napi, struct sge_rspq, napi);
3967	int work_done;
3968	u32 val;
3969
3970	work_done = process_responses(q, budget);
3971	if (likely(work_done < budget)) {
3972		int timer_index;
3973
3974		napi_complete_done(napi, work_done);
3975		timer_index = QINTR_TIMER_IDX_G(q->next_intr_params);
3976
3977		if (q->adaptive_rx) {
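			/* Adaptive Rx: if this poll saw more packets than the
			 * current timer's quota, bump the holdoff timer index;
			 * otherwise back it off, clamping to the valid range.
			 */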
3978			if (work_done > max(timer_pkt_quota[timer_index],
3979					    MIN_NAPI_WORK))
3980				timer_index = (timer_index + 1);
3981			else
3982				timer_index = timer_index - 1;
3983
3984			timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1);
3985			q->next_intr_params =
3986					QINTR_TIMER_IDX_V(timer_index) |
3987					QINTR_CNT_EN_V(0);
3988			params = q->next_intr_params;
3989		} else {
3990			params = q->next_intr_params;
3991			q->next_intr_params = q->intr_params;
3992		}
3993	} else
3994		params = QINTR_TIMER_IDX_V(7);
3995
3996	val = CIDXINC_V(work_done) | SEINTARM_V(params);
3997
3998	/* If we don't have access to the new User GTS (T5+), use the old
3999	 * doorbell mechanism; otherwise use the new BAR2 mechanism.
4000	 */
4001	if (unlikely(q->bar2_addr == NULL)) {
4002		t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS_A),
4003			     val | INGRESSQID_V((u32)q->cntxt_id));
4004	} else {
4005		writel(val | INGRESSQID_V(q->bar2_qid),
4006		       q->bar2_addr + SGE_UDB_GTS);
4007		wmb();
4008	}
4009	return work_done;
4010}
4011
4012void cxgb4_ethofld_restart(struct tasklet_struct *t)
4013{
4014	struct sge_eosw_txq *eosw_txq = from_tasklet(eosw_txq, t,
4015						     qresume_tsk);
4016	int pktcount;
4017
4018	spin_lock(&eosw_txq->lock);
4019	pktcount = eosw_txq->cidx - eosw_txq->last_cidx;
4020	if (pktcount < 0)
4021		pktcount += eosw_txq->ndesc;
4022
4023	if (pktcount) {
4024		cxgb4_eosw_txq_free_desc(netdev2adap(eosw_txq->netdev),
4025					 eosw_txq, pktcount);
4026		eosw_txq->inuse -= pktcount;
4027	}
4028
4029	/* There may be some packets waiting for completions. So,
4030	 * attempt to send these packets now.
4031	 */
4032	ethofld_xmit(eosw_txq->netdev, eosw_txq);
4033	spin_unlock(&eosw_txq->lock);
4034}
4035
4036/* cxgb4_ethofld_rx_handler - Process ETHOFLD Tx completions
4037 * @q: the response queue that received the packet
4038 * @rsp: the response queue descriptor holding the CPL message
4039 * @si: the gather list of packet fragments
4040 *
4041 * Process an ETHOFLD Tx completion. Increment the cidx here, but
4042 * free up the descriptors in a tasklet later.
4043 */
4044int cxgb4_ethofld_rx_handler(struct sge_rspq *q, const __be64 *rsp,
4045			     const struct pkt_gl *si)
4046{
4047	u8 opcode = ((const struct rss_header *)rsp)->opcode;
4048
4049	/* skip RSS header */
4050	rsp++;
4051
4052	if (opcode == CPL_FW4_ACK) {
4053		const struct cpl_fw4_ack *cpl;
4054		struct sge_eosw_txq *eosw_txq;
4055		struct eotid_entry *entry;
4056		struct sk_buff *skb;
4057		u32 hdr_len, eotid;
4058		u8 flits, wrlen16;
4059		int credits;
4060
4061		cpl = (const struct cpl_fw4_ack *)rsp;
4062		eotid = CPL_FW4_ACK_FLOWID_G(ntohl(OPCODE_TID(cpl))) -
4063			q->adap->tids.eotid_base;
4064		entry = cxgb4_lookup_eotid(&q->adap->tids, eotid);
4065		if (!entry)
4066			goto out_done;
4067
4068		eosw_txq = (struct sge_eosw_txq *)entry->data;
4069		if (!eosw_txq)
4070			goto out_done;
4071
4072		spin_lock(&eosw_txq->lock);
4073		credits = cpl->credits;
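		/* Each completed work request returned its length in 16-byte
		 * credits; walk the pending descriptors and retire one WR's
		 * worth of credits per skb until the credits run out.
		 */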
4074		while (credits > 0) {
4075			skb = eosw_txq->desc[eosw_txq->cidx].skb;
4076			if (!skb)
4077				break;
4078
4079			if (unlikely((eosw_txq->state ==
4080				      CXGB4_EO_STATE_FLOWC_OPEN_REPLY ||
4081				      eosw_txq->state ==
4082				      CXGB4_EO_STATE_FLOWC_CLOSE_REPLY) &&
4083				     eosw_txq->cidx == eosw_txq->flowc_idx)) {
4084				flits = DIV_ROUND_UP(skb->len, 8);
4085				if (eosw_txq->state ==
4086				    CXGB4_EO_STATE_FLOWC_OPEN_REPLY)
4087					eosw_txq->state = CXGB4_EO_STATE_ACTIVE;
4088				else
4089					eosw_txq->state = CXGB4_EO_STATE_CLOSED;
4090				complete(&eosw_txq->completion);
4091			} else {
4092				hdr_len = eth_get_headlen(eosw_txq->netdev,
4093							  skb->data,
4094							  skb_headlen(skb));
4095				flits = ethofld_calc_tx_flits(q->adap, skb,
4096							      hdr_len);
4097			}
4098			eosw_txq_advance_index(&eosw_txq->cidx, 1,
4099					       eosw_txq->ndesc);
4100			wrlen16 = DIV_ROUND_UP(flits * 8, 16);
4101			credits -= wrlen16;
4102		}
4103
4104		eosw_txq->cred += cpl->credits;
4105		eosw_txq->ncompl--;
4106
4107		spin_unlock(&eosw_txq->lock);
4108
4109		/* Schedule a tasklet to reclaim SKBs and restart ETHOFLD Tx,
4110		 * if there were packets waiting for completion.
4111		 */
4112		tasklet_schedule(&eosw_txq->qresume_tsk);
4113	}
4114
4115out_done:
4116	return 0;
4117}
4118
4119/*
4120 * The MSI-X interrupt handler for an SGE response queue.
4121 */
4122irqreturn_t t4_sge_intr_msix(int irq, void *cookie)
4123{
4124	struct sge_rspq *q = cookie;
4125
4126	napi_schedule(&q->napi);
4127	return IRQ_HANDLED;
4128}
4129
4130/*
4131 * Process the indirect interrupt entries in the interrupt queue and kick off
4132 * NAPI for each queue that has generated an entry.
4133 */
4134static unsigned int process_intrq(struct adapter *adap)
4135{
4136	unsigned int credits;
4137	const struct rsp_ctrl *rc;
4138	struct sge_rspq *q = &adap->sge.intrq;
4139	u32 val;
4140
4141	spin_lock(&adap->sge.intrq_lock);
4142	for (credits = 0; ; credits++) {
4143		rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
4144		if (!is_new_response(rc, q))
4145			break;
4146
4147		dma_rmb();
4148		if (RSPD_TYPE_G(rc->type_gen) == RSPD_TYPE_INTR_X) {
4149			unsigned int qid = ntohl(rc->pldbuflen_qid);
4150
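			/* The payload carries the absolute ingress queue ID
			 * that raised this entry; convert it to a relative ID
			 * and kick that queue's NAPI handler.
			 */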
4151			qid -= adap->sge.ingr_start;
4152			napi_schedule(&adap->sge.ingr_map[qid]->napi);
4153		}
4154
4155		rspq_next(q);
4156	}
4157
4158	val =  CIDXINC_V(credits) | SEINTARM_V(q->intr_params);
4159
4160	/* If we don't have access to the new User GTS (T5+), use the old
4161	 * doorbell mechanism; otherwise use the new BAR2 mechanism.
4162	 */
4163	if (unlikely(q->bar2_addr == NULL)) {
4164		t4_write_reg(adap, MYPF_REG(SGE_PF_GTS_A),
4165			     val | INGRESSQID_V(q->cntxt_id));
4166	} else {
4167		writel(val | INGRESSQID_V(q->bar2_qid),
4168		       q->bar2_addr + SGE_UDB_GTS);
4169		wmb();
4170	}
4171	spin_unlock(&adap->sge.intrq_lock);
4172	return credits;
4173}
4174
4175/*
4176 * The MSI interrupt handler.  It handles data events from SGE response queues
4177 * as well as error and other async events, as they all use the same MSI vector.
4178 */
4179static irqreturn_t t4_intr_msi(int irq, void *cookie)
4180{
4181	struct adapter *adap = cookie;
4182
4183	if (adap->flags & CXGB4_MASTER_PF)
4184		t4_slow_intr_handler(adap);
4185	process_intrq(adap);
4186	return IRQ_HANDLED;
4187}
4188
4189/*
4190 * Interrupt handler for legacy INTx interrupts.
4191 * Handles data events from SGE response queues as well as error and other
4192 * async events as they all use the same interrupt line.
4193 */
4194static irqreturn_t t4_intr_intx(int irq, void *cookie)
4195{
4196	struct adapter *adap = cookie;
4197
4198	t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI_A), 0);
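	/* The bitwise OR below doesn't short-circuit, so process_intrq()
	 * always runs even when the slow-path handler also reports work.
	 */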
4199	if (((adap->flags & CXGB4_MASTER_PF) && t4_slow_intr_handler(adap)) |
4200	    process_intrq(adap))
4201		return IRQ_HANDLED;
4202	return IRQ_NONE;             /* probably shared interrupt */
4203}
4204
4205/**
4206 *	t4_intr_handler - select the top-level interrupt handler
4207 *	@adap: the adapter
4208 *
4209 *	Selects the top-level interrupt handler based on the type of interrupts
4210 *	(MSI-X, MSI, or INTx).
4211 */
4212irq_handler_t t4_intr_handler(struct adapter *adap)
4213{
4214	if (adap->flags & CXGB4_USING_MSIX)
4215		return t4_sge_intr_msix;
4216	if (adap->flags & CXGB4_USING_MSI)
4217		return t4_intr_msi;
4218	return t4_intr_intx;
4219}
4220
4221static void sge_rx_timer_cb(struct timer_list *t)
4222{
4223	unsigned long m;
4224	unsigned int i;
4225	struct adapter *adap = from_timer(adap, t, sge.rx_timer);
4226	struct sge *s = &adap->sge;
4227
4228	for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
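		/* Scan each word of the starving_fl bitmap; m &= m - 1 clears
		 * the bit just handled, so each set bit (one per starving
		 * Free List) is visited exactly once.
		 */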
4229		for (m = s->starving_fl[i]; m; m &= m - 1) {
4230			struct sge_eth_rxq *rxq;
4231			unsigned int id = __ffs(m) + i * BITS_PER_LONG;
4232			struct sge_fl *fl = s->egr_map[id];
4233
4234			clear_bit(id, s->starving_fl);
4235			smp_mb__after_atomic();
4236
4237			if (fl_starving(adap, fl)) {
4238				rxq = container_of(fl, struct sge_eth_rxq, fl);
4239				if (napi_reschedule(&rxq->rspq.napi))
4240					fl->starving++;
4241				else
4242					set_bit(id, s->starving_fl);
4243			}
4244		}
4245	/* The remainder of the SGE RX Timer Callback routine is dedicated to
4246	 * global Master PF activities like checking for chip ingress stalls,
4247	 * etc.
4248	 */
4249	if (!(adap->flags & CXGB4_MASTER_PF))
4250		goto done;
4251
4252	t4_idma_monitor(adap, &s->idma_monitor, HZ, RX_QCHECK_PERIOD);
4253
4254done:
4255	mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD);
4256}
4257
4258static void sge_tx_timer_cb(struct timer_list *t)
4259{
4260	struct adapter *adap = from_timer(adap, t, sge.tx_timer);
4261	struct sge *s = &adap->sge;
4262	unsigned long m, period;
4263	unsigned int i, budget;
4264
4265	for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
4266		for (m = s->txq_maperr[i]; m; m &= m - 1) {
4267			unsigned long id = __ffs(m) + i * BITS_PER_LONG;
4268			struct sge_uld_txq *txq = s->egr_map[id];
4269
4270			clear_bit(id, s->txq_maperr);
4271			tasklet_schedule(&txq->qresume_tsk);
4272		}
4273
4274	if (!is_t4(adap->params.chip)) {
4275		struct sge_eth_txq *q = &s->ptptxq;
4276		int avail;
4277
4278		spin_lock(&adap->ptp_lock);
4279		avail = reclaimable(&q->q);
4280
4281		if (avail) {
4282			free_tx_desc(adap, &q->q, avail, false);
4283			q->q.in_use -= avail;
4284		}
4285		spin_unlock(&adap->ptp_lock);
4286	}
4287
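	/* Reclaim completed descriptors from the Ethernet TX queues in a
	 * round-robin fashion, starting where the previous run left off and
	 * limited by the timer's reclaim budget.
	 */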
4288	budget = MAX_TIMER_TX_RECLAIM;
4289	i = s->ethtxq_rover;
4290	do {
4291		budget -= t4_sge_eth_txq_egress_update(adap, &s->ethtxq[i],
4292						       budget);
4293		if (!budget)
4294			break;
4295
4296		if (++i >= s->ethqsets)
4297			i = 0;
4298	} while (i != s->ethtxq_rover);
4299	s->ethtxq_rover = i;
4300
4301	if (budget == 0) {
4302		/* If we found too many reclaimable packets schedule a timer
4303		 * in the near future to continue where we left off.
4304		 */
4305		period = 2;
4306	} else {
4307		/* We reclaimed all reclaimable TX Descriptors, so reschedule
4308		 * at the normal period.
4309		 */
4310		period = TX_QCHECK_PERIOD;
4311	}
4312
4313	mod_timer(&s->tx_timer, jiffies + period);
4314}
4315
4316/**
4317 *	bar2_address - return the BAR2 address for an SGE Queue's Registers
4318 *	@adapter: the adapter
4319 *	@qid: the SGE Queue ID
4320 *	@qtype: the SGE Queue Type (Egress or Ingress)
4321 *	@pbar2_qid: BAR2 Queue ID or 0 for Queue ID inferred SGE Queues
4322 *
4323 *	Returns the BAR2 address for the SGE Queue Registers associated with
4324 *	@qid.  If BAR2 SGE Registers aren't available, returns NULL.  Also
4325 *	returns the BAR2 Queue ID to be used with writes to the BAR2 SGE
4326 *	Queue Registers.  If the BAR2 Queue ID is 0, then "Inferred Queue ID"
4327 *	Registers are supported (e.g. the Write Combining Doorbell Buffer).
4328 */
4329static void __iomem *bar2_address(struct adapter *adapter,
4330				  unsigned int qid,
4331				  enum t4_bar2_qtype qtype,
4332				  unsigned int *pbar2_qid)
4333{
4334	u64 bar2_qoffset;
4335	int ret;
4336
4337	ret = t4_bar2_sge_qregs(adapter, qid, qtype, 0,
4338				&bar2_qoffset, pbar2_qid);
4339	if (ret)
4340		return NULL;
4341
4342	return adapter->bar2 + bar2_qoffset;
4343}
4344
4345/* @intr_idx: MSI/MSI-X vector if >=0, -(absolute qid + 1) if < 0
4346 * @cong: < 0 -> no congestion feedback, >= 0 -> congestion channel map
4347 */
4348int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
4349		     struct net_device *dev, int intr_idx,
4350		     struct sge_fl *fl, rspq_handler_t hnd,
4351		     rspq_flush_handler_t flush_hnd, int cong)
4352{
4353	int ret, flsz = 0;
4354	struct fw_iq_cmd c;
4355	struct sge *s = &adap->sge;
4356	struct port_info *pi = netdev_priv(dev);
4357	int relaxed = !(adap->flags & CXGB4_ROOT_NO_RELAXED_ORDERING);
4358
4359	/* Size needs to be multiple of 16, including status entry. */
4360	iq->size = roundup(iq->size, 16);
4361
4362	iq->desc = alloc_ring(adap->pdev_dev, iq->size, iq->iqe_len, 0,
4363			      &iq->phys_addr, NULL, 0,
4364			      dev_to_node(adap->pdev_dev));
4365	if (!iq->desc)
4366		return -ENOMEM;
4367
4368	memset(&c, 0, sizeof(c));
4369	c.op_to_vfn = htonl(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F |
4370			    FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4371			    FW_IQ_CMD_PFN_V(adap->pf) | FW_IQ_CMD_VFN_V(0));
4372	c.alloc_to_len16 = htonl(FW_IQ_CMD_ALLOC_F | FW_IQ_CMD_IQSTART_F |
4373				 FW_LEN16(c));
4374	c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE_V(FW_IQ_TYPE_FL_INT_CAP) |
4375		FW_IQ_CMD_IQASYNCH_V(fwevtq) | FW_IQ_CMD_VIID_V(pi->viid) |
4376		FW_IQ_CMD_IQANDST_V(intr_idx < 0) |
4377		FW_IQ_CMD_IQANUD_V(UPDATEDELIVERY_INTERRUPT_X) |
4378		FW_IQ_CMD_IQANDSTINDEX_V(intr_idx >= 0 ? intr_idx :
4379							-intr_idx - 1));
4380	c.iqdroprss_to_iqesize = htons(FW_IQ_CMD_IQPCIECH_V(pi->tx_chan) |
4381		FW_IQ_CMD_IQGTSMODE_F |
4382		FW_IQ_CMD_IQINTCNTTHRESH_V(iq->pktcnt_idx) |
4383		FW_IQ_CMD_IQESIZE_V(ilog2(iq->iqe_len) - 4));
4384	c.iqsize = htons(iq->size);
4385	c.iqaddr = cpu_to_be64(iq->phys_addr);
4386	if (cong >= 0)
4387		c.iqns_to_fl0congen = htonl(FW_IQ_CMD_IQFLINTCONGEN_F |
4388				FW_IQ_CMD_IQTYPE_V(cong ? FW_IQ_IQTYPE_NIC
4389							:  FW_IQ_IQTYPE_OFLD));
4390
4391	if (fl) {
4392		unsigned int chip_ver =
4393			CHELSIO_CHIP_VERSION(adap->params.chip);
4394
4395		/* Allocate the ring for the hardware free list (with space
4396		 * for its status page) along with the associated software
4397		 * descriptor ring.  The free list size needs to be a multiple
4398		 * of the Egress Queue Unit and at least 2 Egress Units larger
4399		 * than the SGE's Egress Congestion Threshold
4400		 * (fl_starve_thres - 1).
4401		 */
4402		if (fl->size < s->fl_starve_thres - 1 + 2 * 8)
4403			fl->size = s->fl_starve_thres - 1 + 2 * 8;
4404		fl->size = roundup(fl->size, 8);
4405		fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64),
4406				      sizeof(struct rx_sw_desc), &fl->addr,
4407				      &fl->sdesc, s->stat_len,
4408				      dev_to_node(adap->pdev_dev));
4409		if (!fl->desc)
4410			goto fl_nomem;
4411
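		/* The firmware expects the Free List size in Egress Queue
		 * Units (64 bytes, i.e. 8 Free List pointers each), plus room
		 * for the status page.
		 */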
4412		flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc);
4413		c.iqns_to_fl0congen |= htonl(FW_IQ_CMD_FL0PACKEN_F |
4414					     FW_IQ_CMD_FL0FETCHRO_V(relaxed) |
4415					     FW_IQ_CMD_FL0DATARO_V(relaxed) |
4416					     FW_IQ_CMD_FL0PADEN_F);
4417		if (cong >= 0)
4418			c.iqns_to_fl0congen |=
4419				htonl(FW_IQ_CMD_FL0CNGCHMAP_V(cong) |
4420				      FW_IQ_CMD_FL0CONGCIF_F |
4421				      FW_IQ_CMD_FL0CONGEN_F);
4422		/* In T6, an egress queue of type FL has an internal overhead
4423		 * of 16B for the header going into the FLM module, so the maximum
4424		 * allowed burst size is 448 bytes.  For T4/T5, the hardware
4425		 * doesn't coalesce fetch requests if more than 64 bytes of
4426		 * Free List pointers are provided, so we use a 128-byte Fetch
4427		 * Burst Minimum there (T6 implements coalescing so we can use
4428		 * the smaller 64-byte value there).
4429		 */
4430		c.fl0dcaen_to_fl0cidxfthresh =
4431			htons(FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5 ?
4432						   FETCHBURSTMIN_128B_X :
4433						   FETCHBURSTMIN_64B_T6_X) |
4434			      FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
4435						   FETCHBURSTMAX_512B_X :
4436						   FETCHBURSTMAX_256B_X));
4437		c.fl0size = htons(flsz);
4438		c.fl0addr = cpu_to_be64(fl->addr);
4439	}
4440
4441	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4442	if (ret)
4443		goto err;
4444
4445	netif_napi_add(dev, &iq->napi, napi_rx_handler, 64);
4446	iq->cur_desc = iq->desc;
4447	iq->cidx = 0;
4448	iq->gen = 1;
4449	iq->next_intr_params = iq->intr_params;
4450	iq->cntxt_id = ntohs(c.iqid);
4451	iq->abs_id = ntohs(c.physiqid);
4452	iq->bar2_addr = bar2_address(adap,
4453				     iq->cntxt_id,
4454				     T4_BAR2_QTYPE_INGRESS,
4455				     &iq->bar2_qid);
4456	iq->size--;                           /* subtract status entry */
4457	iq->netdev = dev;
4458	iq->handler = hnd;
4459	iq->flush_handler = flush_hnd;
4460
4461	memset(&iq->lro_mgr, 0, sizeof(struct t4_lro_mgr));
4462	skb_queue_head_init(&iq->lro_mgr.lroq);
4463
4464	/* set offset to -1 to distinguish ingress queues without FL */
4465	iq->offset = fl ? 0 : -1;
4466
4467	adap->sge.ingr_map[iq->cntxt_id - adap->sge.ingr_start] = iq;
4468
4469	if (fl) {
4470		fl->cntxt_id = ntohs(c.fl0id);
4471		fl->avail = fl->pend_cred = 0;
4472		fl->pidx = fl->cidx = 0;
4473		fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0;
4474		adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl;
4475
4476		/* Note, we must initialize the BAR2 Free List User Doorbell
4477		 * information before refilling the Free List!
4478		 */
4479		fl->bar2_addr = bar2_address(adap,
4480					     fl->cntxt_id,
4481					     T4_BAR2_QTYPE_EGRESS,
4482					     &fl->bar2_qid);
4483		refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL);
4484	}
4485
4486	/* For T5 and later we attempt to set up the Congestion Manager values
4487	 * of the new RX Ethernet Queue.  This should really be handled by
4488	 * firmware because it's more complex than any host driver wants to
4489	 * get involved with, it's different per chip, and this is almost
4490	 * certainly wrong.  Firmware would be wrong as well, but it would be
4491	 * a lot easier to fix in one place ...  For now we do something very
4492	 * simple (and hopefully less wrong).
4493	 */
4494	if (!is_t4(adap->params.chip) && cong >= 0) {
4495		u32 param, val, ch_map = 0;
4496		int i;
4497		u16 cng_ch_bits_log = adap->params.arch.cng_ch_bits_log;
4498
4499		param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
4500			 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
4501			 FW_PARAMS_PARAM_YZ_V(iq->cntxt_id));
4502		if (cong == 0) {
4503			val = CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_QUEUE_X);
4504		} else {
4505			val =
4506			    CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_CHANNEL_X);
4507			for (i = 0; i < 4; i++) {
4508				if (cong & (1 << i))
4509					ch_map |= 1 << (i << cng_ch_bits_log);
4510			}
4511			val |= CONMCTXT_CNGCHMAP_V(ch_map);
4512		}
4513		ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1,
4514				    &param, &val);
4515		if (ret)
4516			dev_warn(adap->pdev_dev,
4517				 "Failed to set Congestion Manager Context for Ingress Queue %d: %d\n",
4518				 iq->cntxt_id, -ret);
4519	}
4520
4521	return 0;
4522
4523fl_nomem:
4524	ret = -ENOMEM;
4525err:
4526	if (iq->desc) {
4527		dma_free_coherent(adap->pdev_dev, iq->size * iq->iqe_len,
4528				  iq->desc, iq->phys_addr);
4529		iq->desc = NULL;
4530	}
4531	if (fl && fl->desc) {
4532		kfree(fl->sdesc);
4533		fl->sdesc = NULL;
4534		dma_free_coherent(adap->pdev_dev, flsz * sizeof(struct tx_desc),
4535				  fl->desc, fl->addr);
4536		fl->desc = NULL;
4537	}
4538	return ret;
4539}
4540
4541static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
4542{
4543	q->cntxt_id = id;
4544	q->bar2_addr = bar2_address(adap,
4545				    q->cntxt_id,
4546				    T4_BAR2_QTYPE_EGRESS,
4547				    &q->bar2_qid);
4548	q->in_use = 0;
4549	q->cidx = q->pidx = 0;
4550	q->stops = q->restarts = 0;
4551	q->stat = (void *)&q->desc[q->size];
4552	spin_lock_init(&q->db_lock);
4553	adap->sge.egr_map[id - adap->sge.egr_start] = q;
4554}
4555
4556/**
4557 *	t4_sge_alloc_eth_txq - allocate an Ethernet TX Queue
4558 *	@adap: the adapter
4559 *	@txq: the SGE Ethernet TX Queue to initialize
4560 *	@dev: the Linux Network Device
4561 *	@netdevq: the corresponding Linux TX Queue
4562 *	@iqid: the Ingress Queue to which to deliver CIDX Update messages
4563 *	@dbqt: whether this TX Queue will use the SGE Doorbell Queue Timers
4564 */
4565int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
4566			 struct net_device *dev, struct netdev_queue *netdevq,
4567			 unsigned int iqid, u8 dbqt)
4568{
4569	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4570	struct port_info *pi = netdev_priv(dev);
4571	struct sge *s = &adap->sge;
4572	struct fw_eq_eth_cmd c;
4573	int ret, nentries;
4574
4575	/* Add status entries */
4576	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
4577
4578	txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size,
4579			sizeof(struct tx_desc), sizeof(struct tx_sw_desc),
4580			&txq->q.phys_addr, &txq->q.sdesc, s->stat_len,
4581			netdev_queue_numa_node_read(netdevq));
4582	if (!txq->q.desc)
4583		return -ENOMEM;
4584
4585	memset(&c, 0, sizeof(c));
4586	c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_ETH_CMD) | FW_CMD_REQUEST_F |
4587			    FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4588			    FW_EQ_ETH_CMD_PFN_V(adap->pf) |
4589			    FW_EQ_ETH_CMD_VFN_V(0));
4590	c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC_F |
4591				 FW_EQ_ETH_CMD_EQSTART_F | FW_LEN16(c));
4592
4593	/* For TX Ethernet Queues using the SGE Doorbell Queue Timer
4594	 * mechanism, we use Ingress Queue messages for Hardware Consumer
4595	 * Index Updates on the TX Queue.  Otherwise we have the Hardware
4596	 * write the CIDX Updates into the Status Page at the end of the
4597	 * TX Queue.
4598	 */
4599	c.autoequiqe_to_viid = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
4600				     FW_EQ_ETH_CMD_VIID_V(pi->viid));
4601
4602	c.fetchszm_to_iqid =
4603		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
4604		      FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) |
4605		      FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
4606
4607	/* Note that the CIDX Flush Threshold should match MAX_TX_RECLAIM. */
4608	c.dcaen_to_eqsize =
4609		htonl(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
4610					    ? FETCHBURSTMIN_64B_X
4611					    : FETCHBURSTMIN_64B_T6_X) |
4612		      FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4613		      FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4614		      FW_EQ_ETH_CMD_EQSIZE_V(nentries));
4615
4616	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
4617
4618	/* If we're using the SGE Doorbell Queue Timer mechanism, pass in the
4619	 * currently configured Timer Index.  This can be changed later via an
4620	 * ethtool -C tx-usecs {Timer Val} command.  Note that the SGE
4621	 * Doorbell Queue mode is currently automatically enabled in the
4622	 * Firmware by setting either AUTOEQUEQE or AUTOEQUIQE ...
4623	 */
4624	if (dbqt)
4625		c.timeren_timerix =
4626			cpu_to_be32(FW_EQ_ETH_CMD_TIMEREN_F |
4627				    FW_EQ_ETH_CMD_TIMERIX_V(txq->dbqtimerix));
4628
4629	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4630	if (ret) {
4631		kfree(txq->q.sdesc);
4632		txq->q.sdesc = NULL;
4633		dma_free_coherent(adap->pdev_dev,
4634				  nentries * sizeof(struct tx_desc),
4635				  txq->q.desc, txq->q.phys_addr);
4636		txq->q.desc = NULL;
4637		return ret;
4638	}
4639
4640	txq->q.q_type = CXGB4_TXQ_ETH;
4641	init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_G(ntohl(c.eqid_pkd)));
4642	txq->txq = netdevq;
4643	txq->tso = 0;
4644	txq->uso = 0;
4645	txq->tx_cso = 0;
4646	txq->vlan_ins = 0;
4647	txq->mapping_err = 0;
4648	txq->dbqt = dbqt;
4649
4650	return 0;
4651}
4652
4653int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
4654			  struct net_device *dev, unsigned int iqid,
4655			  unsigned int cmplqid)
4656{
4657	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4658	struct port_info *pi = netdev_priv(dev);
4659	struct sge *s = &adap->sge;
4660	struct fw_eq_ctrl_cmd c;
4661	int ret, nentries;
4662
4663	/* Add status entries */
4664	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
4665
4666	txq->q.desc = alloc_ring(adap->pdev_dev, nentries,
4667				 sizeof(struct tx_desc), 0, &txq->q.phys_addr,
4668				 NULL, 0, dev_to_node(adap->pdev_dev));
4669	if (!txq->q.desc)
4670		return -ENOMEM;
4671
4672	c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST_F |
4673			    FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4674			    FW_EQ_CTRL_CMD_PFN_V(adap->pf) |
4675			    FW_EQ_CTRL_CMD_VFN_V(0));
4676	c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_ALLOC_F |
4677				 FW_EQ_CTRL_CMD_EQSTART_F | FW_LEN16(c));
4678	c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_CMPLIQID_V(cmplqid));
4679	c.physeqid_pkd = htonl(0);
4680	c.fetchszm_to_iqid =
4681		htonl(FW_EQ_CTRL_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
4682		      FW_EQ_CTRL_CMD_PCIECHN_V(pi->tx_chan) |
4683		      FW_EQ_CTRL_CMD_FETCHRO_F | FW_EQ_CTRL_CMD_IQID_V(iqid));
4684	c.dcaen_to_eqsize =
4685		htonl(FW_EQ_CTRL_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
4686					     ? FETCHBURSTMIN_64B_X
4687					     : FETCHBURSTMIN_64B_T6_X) |
4688		      FW_EQ_CTRL_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4689		      FW_EQ_CTRL_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4690		      FW_EQ_CTRL_CMD_EQSIZE_V(nentries));
4691	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
4692
4693	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4694	if (ret) {
4695		dma_free_coherent(adap->pdev_dev,
4696				  nentries * sizeof(struct tx_desc),
4697				  txq->q.desc, txq->q.phys_addr);
4698		txq->q.desc = NULL;
4699		return ret;
4700	}
4701
4702	txq->q.q_type = CXGB4_TXQ_CTRL;
4703	init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_G(ntohl(c.cmpliqid_eqid)));
4704	txq->adap = adap;
4705	skb_queue_head_init(&txq->sendq);
4706	tasklet_setup(&txq->qresume_tsk, restart_ctrlq);
4707	txq->full = 0;
4708	return 0;
4709}
4710
4711int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid,
4712			unsigned int cmplqid)
4713{
4714	u32 param, val;
4715
4716	param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
4717		 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL) |
4718		 FW_PARAMS_PARAM_YZ_V(eqid));
4719	val = cmplqid;
4720	return t4_set_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);
4721}
4722
4723static int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_txq *q,
4724				 struct net_device *dev, u32 cmd, u32 iqid)
4725{
4726	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4727	struct port_info *pi = netdev_priv(dev);
4728	struct sge *s = &adap->sge;
4729	struct fw_eq_ofld_cmd c;
4730	u32 fb_min, nentries;
4731	int ret;
4732
4733	/* Add status entries */
4734	nentries = q->size + s->stat_len / sizeof(struct tx_desc);
4735	q->desc = alloc_ring(adap->pdev_dev, q->size, sizeof(struct tx_desc),
4736			     sizeof(struct tx_sw_desc), &q->phys_addr,
4737			     &q->sdesc, s->stat_len, NUMA_NO_NODE);
4738	if (!q->desc)
4739		return -ENOMEM;
4740
4741	if (chip_ver <= CHELSIO_T5)
4742		fb_min = FETCHBURSTMIN_64B_X;
4743	else
4744		fb_min = FETCHBURSTMIN_64B_T6_X;
4745
4746	memset(&c, 0, sizeof(c));
4747	c.op_to_vfn = htonl(FW_CMD_OP_V(cmd) | FW_CMD_REQUEST_F |
4748			    FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4749			    FW_EQ_OFLD_CMD_PFN_V(adap->pf) |
4750			    FW_EQ_OFLD_CMD_VFN_V(0));
4751	c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_ALLOC_F |
4752				 FW_EQ_OFLD_CMD_EQSTART_F | FW_LEN16(c));
4753	c.fetchszm_to_iqid =
4754		htonl(FW_EQ_OFLD_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
4755		      FW_EQ_OFLD_CMD_PCIECHN_V(pi->tx_chan) |
4756		      FW_EQ_OFLD_CMD_FETCHRO_F | FW_EQ_OFLD_CMD_IQID_V(iqid));
4757	c.dcaen_to_eqsize =
4758		htonl(FW_EQ_OFLD_CMD_FBMIN_V(fb_min) |
4759		      FW_EQ_OFLD_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4760		      FW_EQ_OFLD_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4761		      FW_EQ_OFLD_CMD_EQSIZE_V(nentries));
4762	c.eqaddr = cpu_to_be64(q->phys_addr);
4763
4764	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4765	if (ret) {
4766		kfree(q->sdesc);
4767		q->sdesc = NULL;
4768		dma_free_coherent(adap->pdev_dev,
4769				  nentries * sizeof(struct tx_desc),
4770				  q->desc, q->phys_addr);
4771		q->desc = NULL;
4772		return ret;
4773	}
4774
4775	init_txq(adap, q, FW_EQ_OFLD_CMD_EQID_G(ntohl(c.eqid_pkd)));
4776	return 0;
4777}
4778
4779int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
4780			 struct net_device *dev, unsigned int iqid,
4781			 unsigned int uld_type)
4782{
4783	u32 cmd = FW_EQ_OFLD_CMD;
4784	int ret;
4785
4786	if (unlikely(uld_type == CXGB4_TX_CRYPTO))
4787		cmd = FW_EQ_CTRL_CMD;
4788
4789	ret = t4_sge_alloc_ofld_txq(adap, &txq->q, dev, cmd, iqid);
4790	if (ret)
4791		return ret;
4792
4793	txq->q.q_type = CXGB4_TXQ_ULD;
4794	txq->adap = adap;
4795	skb_queue_head_init(&txq->sendq);
4796	tasklet_setup(&txq->qresume_tsk, restart_ofldq);
4797	txq->full = 0;
4798	txq->mapping_err = 0;
4799	return 0;
4800}
4801
4802int t4_sge_alloc_ethofld_txq(struct adapter *adap, struct sge_eohw_txq *txq,
4803			     struct net_device *dev, u32 iqid)
4804{
4805	int ret;
4806
4807	ret = t4_sge_alloc_ofld_txq(adap, &txq->q, dev, FW_EQ_OFLD_CMD, iqid);
4808	if (ret)
4809		return ret;
4810
4811	txq->q.q_type = CXGB4_TXQ_ULD;
4812	spin_lock_init(&txq->lock);
4813	txq->adap = adap;
4814	txq->tso = 0;
4815	txq->uso = 0;
4816	txq->tx_cso = 0;
4817	txq->vlan_ins = 0;
4818	txq->mapping_err = 0;
4819	return 0;
4820}
4821
4822void free_txq(struct adapter *adap, struct sge_txq *q)
4823{
4824	struct sge *s = &adap->sge;
4825
4826	dma_free_coherent(adap->pdev_dev,
4827			  q->size * sizeof(struct tx_desc) + s->stat_len,
4828			  q->desc, q->phys_addr);
4829	q->cntxt_id = 0;
4830	q->sdesc = NULL;
4831	q->desc = NULL;
4832}
4833
4834void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq,
4835		  struct sge_fl *fl)
4836{
4837	struct sge *s = &adap->sge;
4838	unsigned int fl_id = fl ? fl->cntxt_id : 0xffff;
4839
4840	adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL;
4841	t4_iq_free(adap, adap->mbox, adap->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
4842		   rq->cntxt_id, fl_id, 0xffff);
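	/* rq->size had the status entry subtracted when the queue was set up;
	 * add it back so the full descriptor ring is freed.
	 */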
4843	dma_free_coherent(adap->pdev_dev, (rq->size + 1) * rq->iqe_len,
4844			  rq->desc, rq->phys_addr);
4845	netif_napi_del(&rq->napi);
4846	rq->netdev = NULL;
4847	rq->cntxt_id = rq->abs_id = 0;
4848	rq->desc = NULL;
4849
4850	if (fl) {
4851		free_rx_bufs(adap, fl, fl->avail);
4852		dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len,
4853				  fl->desc, fl->addr);
4854		kfree(fl->sdesc);
4855		fl->sdesc = NULL;
4856		fl->cntxt_id = 0;
4857		fl->desc = NULL;
4858	}
4859}
4860
4861/**
4862 *      t4_free_ofld_rxqs - free a block of consecutive Rx queues
4863 *      @adap: the adapter
4864 *      @n: number of queues
4865 *      @q: pointer to first queue
4866 *
4867 *      Release the resources of a consecutive block of offload Rx queues.
4868 */
4869void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q)
4870{
4871	for ( ; n; n--, q++)
4872		if (q->rspq.desc)
4873			free_rspq_fl(adap, &q->rspq,
4874				     q->fl.size ? &q->fl : NULL);
4875}
4876
4877void t4_sge_free_ethofld_txq(struct adapter *adap, struct sge_eohw_txq *txq)
4878{
4879	if (txq->q.desc) {
4880		t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0,
4881				txq->q.cntxt_id);
4882		free_tx_desc(adap, &txq->q, txq->q.in_use, false);
4883		kfree(txq->q.sdesc);
4884		free_txq(adap, &txq->q);
4885	}
4886}
4887
4888/**
4889 *	t4_free_sge_resources - free SGE resources
4890 *	@adap: the adapter
4891 *
4892 *	Frees resources used by the SGE queue sets.
4893 */
4894void t4_free_sge_resources(struct adapter *adap)
4895{
4896	int i;
4897	struct sge_eth_rxq *eq;
4898	struct sge_eth_txq *etq;
4899
4900	/* stop all Rx queues in order to start them draining */
4901	for (i = 0; i < adap->sge.ethqsets; i++) {
4902		eq = &adap->sge.ethrxq[i];
4903		if (eq->rspq.desc)
4904			t4_iq_stop(adap, adap->mbox, adap->pf, 0,
4905				   FW_IQ_TYPE_FL_INT_CAP,
4906				   eq->rspq.cntxt_id,
4907				   eq->fl.size ? eq->fl.cntxt_id : 0xffff,
4908				   0xffff);
4909	}
4910
4911	/* clean up Ethernet Tx/Rx queues */
4912	for (i = 0; i < adap->sge.ethqsets; i++) {
4913		eq = &adap->sge.ethrxq[i];
4914		if (eq->rspq.desc)
4915			free_rspq_fl(adap, &eq->rspq,
4916				     eq->fl.size ? &eq->fl : NULL);
4917		if (eq->msix) {
4918			cxgb4_free_msix_idx_in_bmap(adap, eq->msix->idx);
4919			eq->msix = NULL;
4920		}
4921
4922		etq = &adap->sge.ethtxq[i];
4923		if (etq->q.desc) {
4924			t4_eth_eq_free(adap, adap->mbox, adap->pf, 0,
4925				       etq->q.cntxt_id);
4926			__netif_tx_lock_bh(etq->txq);
4927			free_tx_desc(adap, &etq->q, etq->q.in_use, true);
4928			__netif_tx_unlock_bh(etq->txq);
4929			kfree(etq->q.sdesc);
4930			free_txq(adap, &etq->q);
4931		}
4932	}
4933
4934	/* clean up control Tx queues */
4935	for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) {
4936		struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i];
4937
4938		if (cq->q.desc) {
4939			tasklet_kill(&cq->qresume_tsk);
4940			t4_ctrl_eq_free(adap, adap->mbox, adap->pf, 0,
4941					cq->q.cntxt_id);
4942			__skb_queue_purge(&cq->sendq);
4943			free_txq(adap, &cq->q);
4944		}
4945	}
4946
4947	if (adap->sge.fw_evtq.desc) {
4948		free_rspq_fl(adap, &adap->sge.fw_evtq, NULL);
4949		if (adap->sge.fwevtq_msix_idx >= 0)
4950			cxgb4_free_msix_idx_in_bmap(adap,
4951						    adap->sge.fwevtq_msix_idx);
4952	}
4953
4954	if (adap->sge.nd_msix_idx >= 0)
4955		cxgb4_free_msix_idx_in_bmap(adap, adap->sge.nd_msix_idx);
4956
4957	if (adap->sge.intrq.desc)
4958		free_rspq_fl(adap, &adap->sge.intrq, NULL);
4959
4960	if (!is_t4(adap->params.chip)) {
4961		etq = &adap->sge.ptptxq;
4962		if (etq->q.desc) {
4963			t4_eth_eq_free(adap, adap->mbox, adap->pf, 0,
4964				       etq->q.cntxt_id);
4965			spin_lock_bh(&adap->ptp_lock);
4966			free_tx_desc(adap, &etq->q, etq->q.in_use, true);
4967			spin_unlock_bh(&adap->ptp_lock);
4968			kfree(etq->q.sdesc);
4969			free_txq(adap, &etq->q);
4970		}
4971	}
4972
4973	/* clear the reverse egress queue map */
4974	memset(adap->sge.egr_map, 0,
4975	       adap->sge.egr_sz * sizeof(*adap->sge.egr_map));
4976}
4977
4978void t4_sge_start(struct adapter *adap)
4979{
4980	adap->sge.ethtxq_rover = 0;
4981	mod_timer(&adap->sge.rx_timer, jiffies + RX_QCHECK_PERIOD);
4982	mod_timer(&adap->sge.tx_timer, jiffies + TX_QCHECK_PERIOD);
4983}
4984
4985/**
4986 *	t4_sge_stop - disable SGE operation
4987 *	@adap: the adapter
4988 *
4989 *	Stop tasklets and timers associated with the DMA engine.  Note that
4990 *	this is effective only if measures have been taken to disable any HW
4991 *	events that may restart them.
4992 */
4993void t4_sge_stop(struct adapter *adap)
4994{
4995	int i;
4996	struct sge *s = &adap->sge;
4997
4998	if (s->rx_timer.function)
4999		del_timer_sync(&s->rx_timer);
5000	if (s->tx_timer.function)
5001		del_timer_sync(&s->tx_timer);
5002
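	/* Kill the restart tasklets of any offload and crypto TX queues so
	 * they can't resume transmission after the timers are gone.
	 */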
5003	if (is_offload(adap)) {
5004		struct sge_uld_txq_info *txq_info;
5005
5006		txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
5007		if (txq_info) {
5008			struct sge_uld_txq *txq = txq_info->uldtxq;
5009
5010			for_each_ofldtxq(&adap->sge, i) {
5011				if (txq[i].q.desc)
5012					tasklet_kill(&txq[i].qresume_tsk);
5013			}
5014		}
5015	}
5016
5017	if (is_pci_uld(adap)) {
5018		struct sge_uld_txq_info *txq_info;
5019
5020		txq_info = adap->sge.uld_txq_info[CXGB4_TX_CRYPTO];
5021		if (txq_info) {
5022			struct sge_uld_txq *txq = txq_info->uldtxq;
5023
5024			for_each_ofldtxq(&adap->sge, i) {
5025				if (txq[i].q.desc)
5026					tasklet_kill(&txq[i].qresume_tsk);
5027			}
5028		}
5029	}
5030
5031	for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) {
5032		struct sge_ctrl_txq *cq = &s->ctrlq[i];
5033
5034		if (cq->q.desc)
5035			tasklet_kill(&cq->qresume_tsk);
5036	}
5037}
5038
5039/**
5040 *	t4_sge_init_soft - grab core SGE values needed by SGE code
5041 *	@adap: the adapter
5042 *
5043 *	We need to grab the SGE operating parameters that we need to have
5044 *	in order to do our job and make sure we can live with them.
5045 */
5047static int t4_sge_init_soft(struct adapter *adap)
5048{
5049	struct sge *s = &adap->sge;
5050	u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu;
5051	u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5;
5052	u32 ingress_rx_threshold;
5053
5054	/*
5055	 * Verify that CPL messages are going to the Ingress Queue for
5056	 * process_responses() and that only packet data is going to the
5057	 * Free Lists.
5058	 */
5059	if ((t4_read_reg(adap, SGE_CONTROL_A) & RXPKTCPLMODE_F) !=
5060	    RXPKTCPLMODE_V(RXPKTCPLMODE_SPLIT_X)) {
5061		dev_err(adap->pdev_dev, "bad SGE CPL MODE\n");
5062		return -EINVAL;
5063	}
5064
5065	/*
5066	 * Validate the Host Buffer Register Array indices that we want to
5067	 * use ...
5068	 *
5069	 * XXX Note that we should really read through the Host Buffer Size
5070	 * XXX register array and find the indices of the Buffer Sizes which
5071	 * XXX meet our needs!
5072	 */
5073	#define READ_FL_BUF(x) \
5074		t4_read_reg(adap, SGE_FL_BUFFER_SIZE0_A+(x)*sizeof(u32))
5075
5076	fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF);
5077	fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF);
5078	fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF);
5079	fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF);
5080
5081	/* We only bother using the Large Page logic if the Large Page Buffer
5082	 * is larger than our Page Size Buffer.
5083	 */
5084	if (fl_large_pg <= fl_small_pg)
5085		fl_large_pg = 0;
5086
5087	#undef READ_FL_BUF
5088
5089	/* The Page Size Buffer must be exactly equal to our Page Size and the
5090	 * Large Page Size Buffer should be 0 (per above) or a power of 2.
5091	 */
5092	if (fl_small_pg != PAGE_SIZE ||
5093	    (fl_large_pg & (fl_large_pg-1)) != 0) {
5094		dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n",
5095			fl_small_pg, fl_large_pg);
5096		return -EINVAL;
5097	}
5098	if (fl_large_pg)
5099		s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT;
5100
5101	if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) ||
5102	    fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) {
5103		dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n",
5104			fl_small_mtu, fl_large_mtu);
5105		return -EINVAL;
5106	}
5107
5108	/*
5109	 * Retrieve our RX interrupt holdoff timer values and counter
5110	 * threshold values from the SGE parameters.
5111	 */
5112	timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1_A);
5113	timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3_A);
5114	timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5_A);
5115	s->timer_val[0] = core_ticks_to_us(adap,
5116		TIMERVALUE0_G(timer_value_0_and_1));
5117	s->timer_val[1] = core_ticks_to_us(adap,
5118		TIMERVALUE1_G(timer_value_0_and_1));
5119	s->timer_val[2] = core_ticks_to_us(adap,
5120		TIMERVALUE2_G(timer_value_2_and_3));
5121	s->timer_val[3] = core_ticks_to_us(adap,
5122		TIMERVALUE3_G(timer_value_2_and_3));
5123	s->timer_val[4] = core_ticks_to_us(adap,
5124		TIMERVALUE4_G(timer_value_4_and_5));
5125	s->timer_val[5] = core_ticks_to_us(adap,
5126		TIMERVALUE5_G(timer_value_4_and_5));
5127
5128	ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD_A);
5129	s->counter_val[0] = THRESHOLD_0_G(ingress_rx_threshold);
5130	s->counter_val[1] = THRESHOLD_1_G(ingress_rx_threshold);
5131	s->counter_val[2] = THRESHOLD_2_G(ingress_rx_threshold);
5132	s->counter_val[3] = THRESHOLD_3_G(ingress_rx_threshold);
5133
5134	return 0;
5135}
5136
5137/**
5138 *     t4_sge_init - initialize SGE
5139 *     @adap: the adapter
5140 *
5141 *     Perform low-level SGE code initialization needed every time after a
5142 *     chip reset.
5143 */
5144int t4_sge_init(struct adapter *adap)
5145{
5146	struct sge *s = &adap->sge;
5147	u32 sge_control, sge_conm_ctrl;
5148	int ret, egress_threshold;
5149
5150	/*
5151	 * Ingress Padding Boundary and Egress Status Page Size are set up by
5152	 * t4_fixup_host_params().
5153	 */
5154	sge_control = t4_read_reg(adap, SGE_CONTROL_A);
5155	s->pktshift = PKTSHIFT_G(sge_control);
5156	s->stat_len = (sge_control & EGRSTATUSPAGESIZE_F) ? 128 : 64;
5157
5158	s->fl_align = t4_fl_pkt_align(adap);
5159	ret = t4_sge_init_soft(adap);
5160	if (ret < 0)
5161		return ret;
5162
5163	/*
5164	 * A FL with <= fl_starve_thres buffers is starving and a periodic
5165	 * timer will attempt to refill it.  This needs to be larger than the
5166	 * SGE's Egress Congestion Threshold.  If it isn't, then we can get
5167	 * stuck waiting for new packets while the SGE is waiting for us to
5168	 * give it more Free List entries.  (Note that the SGE's Egress
5169	 * Congestion Threshold is in units of 2 Free List pointers.) For T4,
5170	 * there was only a single field to control this.  For T5 there's the
5171	 * original field which now only applies to Unpacked Mode Free List
5172	 * buffers and a new field which only applies to Packed Mode Free List
5173	 * buffers.
5174	 */
5175	sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL_A);
5176	switch (CHELSIO_CHIP_VERSION(adap->params.chip)) {
5177	case CHELSIO_T4:
5178		egress_threshold = EGRTHRESHOLD_G(sge_conm_ctrl);
5179		break;
5180	case CHELSIO_T5:
5181		egress_threshold = EGRTHRESHOLDPACKING_G(sge_conm_ctrl);
5182		break;
5183	case CHELSIO_T6:
5184		egress_threshold = T6_EGRTHRESHOLDPACKING_G(sge_conm_ctrl);
5185		break;
5186	default:
5187		dev_err(adap->pdev_dev, "Unsupported Chip version %d\n",
5188			CHELSIO_CHIP_VERSION(adap->params.chip));
5189		return -EINVAL;
5190	}
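	/* The threshold register value is in units of 2 Free List pointers,
	 * so double it and add one extra buffer of headroom.
	 */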
5191	s->fl_starve_thres = 2*egress_threshold + 1;
5192
5193	t4_idma_monitor_init(adap, &s->idma_monitor);
5194
5195	/* Set up timers used for recurring callbacks to process RX and TX
5196	 * administrative tasks.
5197	 */
5198	timer_setup(&s->rx_timer, sge_rx_timer_cb, 0);
5199	timer_setup(&s->tx_timer, sge_tx_timer_cb, 0);
5200
5201	spin_lock_init(&s->intrq_lock);
5202
5203	return 0;
5204}
5205