xref: /kernel/linux/linux-6.6/net/sched/sch_taprio.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2
3/* net/sched/sch_taprio.c	 Time Aware Priority Scheduler
4 *
5 * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
6 *
7 */
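
/* Illustrative configuration sketch (not from this file; device name, times,
 * gate masks and queue mapping are placeholder values, loosely following the
 * tc-taprio(8) examples): taprio is configured from user space with the tc(8)
 * "taprio" qdisc, e.g. in pure software mode:
 *
 *   tc qdisc replace dev eth0 parent root handle 100 taprio \
 *       num_tc 3 \
 *       map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
 *       queues 1@0 1@1 2@2 \
 *       base-time 1000000000 \
 *       sched-entry S 01 300000 \
 *       sched-entry S 02 300000 \
 *       sched-entry S 04 400000 \
 *       clockid CLOCK_TAI
 *
 * Each "sched-entry S <gate-mask> <interval-ns>" becomes one struct
 * sched_entry below, and the entries together form one cycle.
 */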
8
9#include <linux/ethtool.h>
10#include <linux/ethtool_netlink.h>
11#include <linux/types.h>
12#include <linux/slab.h>
13#include <linux/kernel.h>
14#include <linux/string.h>
15#include <linux/list.h>
16#include <linux/errno.h>
17#include <linux/skbuff.h>
18#include <linux/math64.h>
19#include <linux/module.h>
20#include <linux/spinlock.h>
21#include <linux/rcupdate.h>
22#include <linux/time.h>
23#include <net/gso.h>
24#include <net/netlink.h>
25#include <net/pkt_sched.h>
26#include <net/pkt_cls.h>
27#include <net/sch_generic.h>
28#include <net/sock.h>
29#include <net/tcp.h>
30
31#define TAPRIO_STAT_NOT_SET	(~0ULL)
32
33#include "sch_mqprio_lib.h"
34
35static LIST_HEAD(taprio_list);
36static struct static_key_false taprio_have_broken_mqprio;
37static struct static_key_false taprio_have_working_mqprio;
38
39#define TAPRIO_ALL_GATES_OPEN -1
40
41#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
42#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
43#define TAPRIO_FLAGS_INVALID U32_MAX
44
45struct sched_entry {
46	/* Durations between this GCL entry and the GCL entry where the
47	 * respective traffic class gate closes
48	 */
49	u64 gate_duration[TC_MAX_QUEUE];
50	atomic_t budget[TC_MAX_QUEUE];
51	/* The qdisc makes some effort so that no packet leaves
52	 * after this time
53	 */
54	ktime_t gate_close_time[TC_MAX_QUEUE];
55	struct list_head list;
56	/* Used to calculate when to advance the schedule */
57	ktime_t end_time;
58	ktime_t next_txtime;
59	int index;
60	u32 gate_mask;
61	u32 interval;
62	u8 command;
63};
64
65struct sched_gate_list {
66	/* Longest non-zero contiguous gate durations per traffic class,
67	 * or 0 if a traffic class gate never opens during the schedule.
68	 */
69	u64 max_open_gate_duration[TC_MAX_QUEUE];
70	u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
71	u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
72	struct rcu_head rcu;
73	struct list_head entries;
74	size_t num_entries;
75	ktime_t cycle_end_time;
76	s64 cycle_time;
77	s64 cycle_time_extension;
78	s64 base_time;
79};
80
81struct taprio_sched {
82	struct Qdisc **qdiscs;
83	struct Qdisc *root;
84	u32 flags;
85	enum tk_offsets tk_offset;
86	int clockid;
87	bool offloaded;
88	bool detected_mqprio;
89	bool broken_mqprio;
90	atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
91				    * speeds it's sub-nanoseconds per byte
92				    */
93
94	/* Protects the update side of the RCU protected current_entry */
95	spinlock_t current_entry_lock;
96	struct sched_entry __rcu *current_entry;
97	struct sched_gate_list __rcu *oper_sched;
98	struct sched_gate_list __rcu *admin_sched;
99	struct hrtimer advance_timer;
100	struct list_head taprio_list;
101	int cur_txq[TC_MAX_QUEUE];
102	u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
103	u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
104	u32 txtime_delay;
105};
106
107struct __tc_taprio_qopt_offload {
108	refcount_t users;
109	struct tc_taprio_qopt_offload offload;
110};
111
112static void taprio_calculate_gate_durations(struct taprio_sched *q,
113					    struct sched_gate_list *sched)
114{
115	struct net_device *dev = qdisc_dev(q->root);
116	int num_tc = netdev_get_num_tc(dev);
117	struct sched_entry *entry, *cur;
118	int tc;
119
120	list_for_each_entry(entry, &sched->entries, list) {
121		u32 gates_still_open = entry->gate_mask;
122
123		/* For each traffic class, calculate each open gate duration,
124		 * starting at this schedule entry and ending at the schedule
125		 * entry containing a gate close event for that TC.
126		 */
127		cur = entry;
128
129		do {
130			if (!gates_still_open)
131				break;
132
133			for (tc = 0; tc < num_tc; tc++) {
134				if (!(gates_still_open & BIT(tc)))
135					continue;
136
137				if (cur->gate_mask & BIT(tc))
138					entry->gate_duration[tc] += cur->interval;
139				else
140					gates_still_open &= ~BIT(tc);
141			}
142
143			cur = list_next_entry_circular(cur, &sched->entries, list);
144		} while (cur != entry);
145
146		/* Keep track of the maximum gate duration for each traffic
147		 * class, taking care to not confuse a traffic class which is
148		 * temporarily closed with one that is always closed.
149		 */
150		for (tc = 0; tc < num_tc; tc++)
151			if (entry->gate_duration[tc] &&
152			    sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
153				sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
154	}
155}
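
/* Worked example (hypothetical 3-entry schedule, cycle_time = 1000 us):
 *
 *   E0: gate_mask 0x1, interval 300 us
 *   E1: gate_mask 0x3, interval 300 us
 *   E2: gate_mask 0x2, interval 400 us
 *
 * Starting from E0, TC0's gate stays open through E0 and E1 and closes at E2,
 * so E0's gate_duration[0] = 300 + 300 = 600 us; TC1 is closed at E0, so
 * E0's gate_duration[1] stays 0. Starting from E1, gate_duration[0] = 300 us
 * and gate_duration[1] = 300 + 400 = 700 us. Starting from E2,
 * gate_duration[1] = 400 us. The resulting max_open_gate_duration[] is
 * { 600 us, 700 us }.
 */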
156
157static bool taprio_entry_allows_tx(ktime_t skb_end_time,
158				   struct sched_entry *entry, int tc)
159{
160	return ktime_before(skb_end_time, entry->gate_close_time[tc]);
161}
162
163static ktime_t sched_base_time(const struct sched_gate_list *sched)
164{
165	if (!sched)
166		return KTIME_MAX;
167
168	return ns_to_ktime(sched->base_time);
169}
170
171static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
172{
173	/* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
174	enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);
175
176	switch (tk_offset) {
177	case TK_OFFS_MAX:
178		return mono;
179	default:
180		return ktime_mono_to_any(mono, tk_offset);
181	}
182}
183
184static ktime_t taprio_get_time(const struct taprio_sched *q)
185{
186	return taprio_mono_to_any(q, ktime_get());
187}
188
189static void taprio_free_sched_cb(struct rcu_head *head)
190{
191	struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
192	struct sched_entry *entry, *n;
193
194	list_for_each_entry_safe(entry, n, &sched->entries, list) {
195		list_del(&entry->list);
196		kfree(entry);
197	}
198
199	kfree(sched);
200}
201
202static void switch_schedules(struct taprio_sched *q,
203			     struct sched_gate_list **admin,
204			     struct sched_gate_list **oper)
205{
206	rcu_assign_pointer(q->oper_sched, *admin);
207	rcu_assign_pointer(q->admin_sched, NULL);
208
209	if (*oper)
210		call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
211
212	*oper = *admin;
213	*admin = NULL;
214}
215
216/* Get how much time has already elapsed in the current cycle. */
217static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
218{
219	ktime_t time_since_sched_start;
220	s32 time_elapsed;
221
222	time_since_sched_start = ktime_sub(time, sched->base_time);
223	div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
224
225	return time_elapsed;
226}
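
/* For example (hypothetical values): with base_time = 0 and cycle_time = 1 ms,
 * a timestamp of 2.3 ms is two full cycles plus 0.3 ms, so this returns
 * 300000 ns.
 */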
227
228static ktime_t get_interval_end_time(struct sched_gate_list *sched,
229				     struct sched_gate_list *admin,
230				     struct sched_entry *entry,
231				     ktime_t intv_start)
232{
233	s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
234	ktime_t intv_end, cycle_ext_end, cycle_end;
235
236	cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
237	intv_end = ktime_add_ns(intv_start, entry->interval);
238	cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
239
240	if (ktime_before(intv_end, cycle_end))
241		return intv_end;
242	else if (admin && admin != sched &&
243		 ktime_after(admin->base_time, cycle_end) &&
244		 ktime_before(admin->base_time, cycle_ext_end))
245		return admin->base_time;
246	else
247		return cycle_end;
248}
249
250static int length_to_duration(struct taprio_sched *q, int len)
251{
252	return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
253}
254
255static int duration_to_length(struct taprio_sched *q, u64 duration)
256{
257	return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
258}
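
/* Worked example (assuming a 1 Gbps link, i.e. picos_per_byte = 8000, see
 * taprio_set_picos_per_byte() below): a 1500 byte frame takes
 * length_to_duration() = 1500 * 8000 / 1000 = 12000 ns to transmit, and
 * duration_to_length(12000 ns) = 12000 * 1000 / 8000 = 1500 bytes round-trips
 * back to the same frame size.
 */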
259
260/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
261 * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
262 * the maximum open gate durations at the given link speed.
263 */
264static void taprio_update_queue_max_sdu(struct taprio_sched *q,
265					struct sched_gate_list *sched,
266					struct qdisc_size_table *stab)
267{
268	struct net_device *dev = qdisc_dev(q->root);
269	int num_tc = netdev_get_num_tc(dev);
270	u32 max_sdu_from_user;
271	u32 max_sdu_dynamic;
272	u32 max_sdu;
273	int tc;
274
275	for (tc = 0; tc < num_tc; tc++) {
276		max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;
277
278		/* TC gate never closes => keep the queueMaxSDU
279		 * selected by the user
280		 */
281		if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
282			max_sdu_dynamic = U32_MAX;
283		} else {
284			u32 max_frm_len;
285
286			max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
287			/* Compensate for L1 overhead from size table,
288			 * but don't let the frame size go negative
289			 */
290			if (stab) {
291				max_frm_len -= stab->szopts.overhead;
292				max_frm_len = max_t(int, max_frm_len,
293						    dev->hard_header_len + 1);
294			}
295			max_sdu_dynamic = max_frm_len - dev->hard_header_len;
296			if (max_sdu_dynamic > dev->max_mtu)
297				max_sdu_dynamic = U32_MAX;
298		}
299
300		max_sdu = min(max_sdu_dynamic, max_sdu_from_user);
301
302		if (max_sdu != U32_MAX) {
303			sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
304			sched->max_sdu[tc] = max_sdu;
305		} else {
306			sched->max_frm_len[tc] = U32_MAX; /* never oversized */
307			sched->max_sdu[tc] = 0;
308		}
309	}
310}
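
/* Illustrative numbers (hypothetical, ignoring any size table overhead): at
 * 1 Gbps, a 12 us maximum open gate duration converts to
 * max_frm_len = duration_to_length(12000 ns) = 1500 bytes, so
 * max_sdu_dynamic = 1500 - dev->hard_header_len (14 for Ethernet) = 1486
 * bytes. If the user requested max_sdu 1000 for that TC, the smaller value
 * wins: max_sdu[tc] = 1000 and max_frm_len[tc] = 1014.
 */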
311
312/* Returns the entry corresponding to the next available interval. If
313 * validate_interval is set, it only validates whether the timestamp occurs
314 * when the gate corresponding to the skb's traffic class is open.
315 */
316static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
317						  struct Qdisc *sch,
318						  struct sched_gate_list *sched,
319						  struct sched_gate_list *admin,
320						  ktime_t time,
321						  ktime_t *interval_start,
322						  ktime_t *interval_end,
323						  bool validate_interval)
324{
325	ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
326	ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
327	struct sched_entry *entry = NULL, *entry_found = NULL;
328	struct taprio_sched *q = qdisc_priv(sch);
329	struct net_device *dev = qdisc_dev(sch);
330	bool entry_available = false;
331	s32 cycle_elapsed;
332	int tc, n;
333
334	tc = netdev_get_prio_tc_map(dev, skb->priority);
335	packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
336
337	*interval_start = 0;
338	*interval_end = 0;
339
340	if (!sched)
341		return NULL;
342
343	cycle = sched->cycle_time;
344	cycle_elapsed = get_cycle_time_elapsed(sched, time);
345	curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
346	cycle_end = ktime_add_ns(curr_intv_end, cycle);
347
348	list_for_each_entry(entry, &sched->entries, list) {
349		curr_intv_start = curr_intv_end;
350		curr_intv_end = get_interval_end_time(sched, admin, entry,
351						      curr_intv_start);
352
353		if (ktime_after(curr_intv_start, cycle_end))
354			break;
355
356		if (!(entry->gate_mask & BIT(tc)) ||
357		    packet_transmit_time > entry->interval)
358			continue;
359
360		txtime = entry->next_txtime;
361
362		if (ktime_before(txtime, time) || validate_interval) {
363			transmit_end_time = ktime_add_ns(time, packet_transmit_time);
364			if ((ktime_before(curr_intv_start, time) &&
365			     ktime_before(transmit_end_time, curr_intv_end)) ||
366			    (ktime_after(curr_intv_start, time) && !validate_interval)) {
367				entry_found = entry;
368				*interval_start = curr_intv_start;
369				*interval_end = curr_intv_end;
370				break;
371			} else if (!entry_available && !validate_interval) {
372				/* Here, we are just trying to find out the
373				 * first available interval in the next cycle.
374				 */
375				entry_available = true;
376				entry_found = entry;
377				*interval_start = ktime_add_ns(curr_intv_start, cycle);
378				*interval_end = ktime_add_ns(curr_intv_end, cycle);
379			}
380		} else if (ktime_before(txtime, earliest_txtime) &&
381			   !entry_available) {
382			earliest_txtime = txtime;
383			entry_found = entry;
384			n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
385			*interval_start = ktime_add(curr_intv_start, n * cycle);
386			*interval_end = ktime_add(curr_intv_end, n * cycle);
387		}
388	}
389
390	return entry_found;
391}
392
393static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
394{
395	struct taprio_sched *q = qdisc_priv(sch);
396	struct sched_gate_list *sched, *admin;
397	ktime_t interval_start, interval_end;
398	struct sched_entry *entry;
399
400	rcu_read_lock();
401	sched = rcu_dereference(q->oper_sched);
402	admin = rcu_dereference(q->admin_sched);
403
404	entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
405				       &interval_start, &interval_end, true);
406	rcu_read_unlock();
407
408	return entry;
409}
410
411static bool taprio_flags_valid(u32 flags)
412{
413	/* Make sure no other flag bits are set. */
414	if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
415		      TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
416		return false;
417	/* txtime-assist and full offload are mutually exclusive */
418	if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
419	    (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
420		return false;
421	return true;
422}
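
/* Assuming the UAPI values TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST = 0x1 and
 * TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD = 0x2, the only accepted 'flags' values
 * are therefore 0x0 (software mode), 0x1 (txtime-assist) and 0x2 (full
 * offload).
 */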
423
424/* This returns the tstamp value set by TCP in terms of the set clock. */
425static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
426{
427	unsigned int offset = skb_network_offset(skb);
428	const struct ipv6hdr *ipv6h;
429	const struct iphdr *iph;
430	struct ipv6hdr _ipv6h;
431
432	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
433	if (!ipv6h)
434		return 0;
435
436	if (ipv6h->version == 4) {
437		iph = (struct iphdr *)ipv6h;
438		offset += iph->ihl * 4;
439
440		/* special-case 6in4 tunnelling, as that is a common way to get
441		 * v6 connectivity in the home
442		 */
443		if (iph->protocol == IPPROTO_IPV6) {
444			ipv6h = skb_header_pointer(skb, offset,
445						   sizeof(_ipv6h), &_ipv6h);
446
447			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
448				return 0;
449		} else if (iph->protocol != IPPROTO_TCP) {
450			return 0;
451		}
452	} else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
453		return 0;
454	}
455
456	return taprio_mono_to_any(q, skb->skb_mstamp_ns);
457}
458
459/* There are a few scenarios where we will have to modify the txtime from
460 * what is read from next_txtime in sched_entry. They are:
461 * 1. If txtime is in the past,
462 *    a. If the gate for the traffic class is currently open and the packet
463 *       can be transmitted before it closes, schedule the packet right away.
464 *    b. If the gate corresponding to the traffic class is going to open later
465 *       in the cycle, set the txtime of packet to the interval start.
466 * 2. If txtime is in the future, there are packets corresponding to the
467 *    current traffic class waiting to be transmitted. So, the following
468 *    possibilities exist:
469 *    a. We can transmit the packet before the window containing the txtime
470 *       closes.
471 *    b. The window might close before the transmission can be completed
472 *       successfully. So, schedule the packet in the next open window.
473 */
474static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
475{
476	ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
477	struct taprio_sched *q = qdisc_priv(sch);
478	struct sched_gate_list *sched, *admin;
479	ktime_t minimum_time, now, txtime;
480	int len, packet_transmit_time;
481	struct sched_entry *entry;
482	bool sched_changed;
483
484	now = taprio_get_time(q);
485	minimum_time = ktime_add_ns(now, q->txtime_delay);
486
487	tcp_tstamp = get_tcp_tstamp(q, skb);
488	minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
489
490	rcu_read_lock();
491	admin = rcu_dereference(q->admin_sched);
492	sched = rcu_dereference(q->oper_sched);
493	if (admin && ktime_after(minimum_time, admin->base_time))
494		switch_schedules(q, &admin, &sched);
495
496	/* Until the schedule starts, all the queues are open */
497	if (!sched || ktime_before(minimum_time, sched->base_time)) {
498		txtime = minimum_time;
499		goto done;
500	}
501
502	len = qdisc_pkt_len(skb);
503	packet_transmit_time = length_to_duration(q, len);
504
505	do {
506		sched_changed = false;
507
508		entry = find_entry_to_transmit(skb, sch, sched, admin,
509					       minimum_time,
510					       &interval_start, &interval_end,
511					       false);
512		if (!entry) {
513			txtime = 0;
514			goto done;
515		}
516
517		txtime = entry->next_txtime;
518		txtime = max_t(ktime_t, txtime, minimum_time);
519		txtime = max_t(ktime_t, txtime, interval_start);
520
521		if (admin && admin != sched &&
522		    ktime_after(txtime, admin->base_time)) {
523			sched = admin;
524			sched_changed = true;
525			continue;
526		}
527
528		transmit_end_time = ktime_add(txtime, packet_transmit_time);
529		minimum_time = transmit_end_time;
530
531		/* Update the txtime of the current entry to the next time its
532		 * interval starts.
533		 */
534		if (ktime_after(transmit_end_time, interval_end))
535			entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
536	} while (sched_changed || ktime_after(transmit_end_time, interval_end));
537
538	entry->next_txtime = transmit_end_time;
539
540done:
541	rcu_read_unlock();
542	return txtime;
543}
544
545/* Devices with full offload are expected to honor this in hardware */
546static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
547					     struct sk_buff *skb)
548{
549	struct taprio_sched *q = qdisc_priv(sch);
550	struct net_device *dev = qdisc_dev(sch);
551	struct sched_gate_list *sched;
552	int prio = skb->priority;
553	bool exceeds = false;
554	u8 tc;
555
556	tc = netdev_get_prio_tc_map(dev, prio);
557
558	rcu_read_lock();
559	sched = rcu_dereference(q->oper_sched);
560	if (sched && skb->len > sched->max_frm_len[tc])
561		exceeds = true;
562	rcu_read_unlock();
563
564	return exceeds;
565}
566
567static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
568			      struct Qdisc *child, struct sk_buff **to_free)
569{
570	struct taprio_sched *q = qdisc_priv(sch);
571
572	/* sk_flags are only safe to use on full sockets. */
573	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
574		if (!is_valid_interval(skb, sch))
575			return qdisc_drop(skb, sch, to_free);
576	} else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
577		skb->tstamp = get_packet_txtime(skb, sch);
578		if (!skb->tstamp)
579			return qdisc_drop(skb, sch, to_free);
580	}
581
582	qdisc_qstats_backlog_inc(sch, skb);
583	sch->q.qlen++;
584
585	return qdisc_enqueue(skb, child, to_free);
586}
587
588static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
589				    struct Qdisc *child,
590				    struct sk_buff **to_free)
591{
592	unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
593	netdev_features_t features = netif_skb_features(skb);
594	struct sk_buff *segs, *nskb;
595	int ret;
596
597	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
598	if (IS_ERR_OR_NULL(segs))
599		return qdisc_drop(skb, sch, to_free);
600
601	skb_list_walk_safe(segs, segs, nskb) {
602		skb_mark_not_on_list(segs);
603		qdisc_skb_cb(segs)->pkt_len = segs->len;
604		slen += segs->len;
605
606		/* FIXME: we should be segmenting to a smaller size
607		 * rather than dropping these
608		 */
609		if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
610			ret = qdisc_drop(segs, sch, to_free);
611		else
612			ret = taprio_enqueue_one(segs, sch, child, to_free);
613
614		if (ret != NET_XMIT_SUCCESS) {
615			if (net_xmit_drop_count(ret))
616				qdisc_qstats_drop(sch);
617		} else {
618			numsegs++;
619		}
620	}
621
622	if (numsegs > 1)
623		qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
624	consume_skb(skb);
625
626	return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
627}
628
629/* Will not be called in the full offload case, since the TX queues are
630 * attached to the Qdisc created using qdisc_create_dflt()
631 */
632static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
633			  struct sk_buff **to_free)
634{
635	struct taprio_sched *q = qdisc_priv(sch);
636	struct Qdisc *child;
637	int queue;
638
639	queue = skb_get_queue_mapping(skb);
640
641	child = q->qdiscs[queue];
642	if (unlikely(!child))
643		return qdisc_drop(skb, sch, to_free);
644
645	if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
646		/* Large packets might not be transmitted when the transmission
647		 * duration exceeds any configured interval. Therefore, segment
648		 * the skb into smaller chunks. Drivers with full offload are
649		 * expected to handle this in hardware.
650		 */
651		if (skb_is_gso(skb))
652			return taprio_enqueue_segmented(skb, sch, child,
653							to_free);
654
655		return qdisc_drop(skb, sch, to_free);
656	}
657
658	return taprio_enqueue_one(skb, sch, child, to_free);
659}
660
661static struct sk_buff *taprio_peek(struct Qdisc *sch)
662{
663	WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
664	return NULL;
665}
666
667static void taprio_set_budgets(struct taprio_sched *q,
668			       struct sched_gate_list *sched,
669			       struct sched_entry *entry)
670{
671	struct net_device *dev = qdisc_dev(q->root);
672	int num_tc = netdev_get_num_tc(dev);
673	int tc, budget;
674
675	for (tc = 0; tc < num_tc; tc++) {
676		/* Traffic classes which never close have infinite budget */
677		if (entry->gate_duration[tc] == sched->cycle_time)
678			budget = INT_MAX;
679		else
680			budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
681					   atomic64_read(&q->picos_per_byte));
682
683		atomic_set(&entry->budget[tc], budget);
684	}
685}
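
/* Example budget (hypothetical): a 600 us open gate duration at 1 Gbps
 * (picos_per_byte = 8000) gives 600000 * 1000 / 8000 = 75000 bytes that may
 * be dequeued for that traffic class before its gate closes.
 */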
686
687/* When an skb is sent, it consumes from the budget of all traffic classes */
688static int taprio_update_budgets(struct sched_entry *entry, size_t len,
689				 int tc_consumed, int num_tc)
690{
691	int tc, budget, new_budget = 0;
692
693	for (tc = 0; tc < num_tc; tc++) {
694		budget = atomic_read(&entry->budget[tc]);
695		/* Don't consume from infinite budget */
696		if (budget == INT_MAX) {
697			if (tc == tc_consumed)
698				new_budget = budget;
699			continue;
700		}
701
702		if (tc == tc_consumed)
703			new_budget = atomic_sub_return(len, &entry->budget[tc]);
704		else
705			atomic_sub(len, &entry->budget[tc]);
706	}
707
708	return new_budget;
709}
710
711static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
712					       struct sched_entry *entry,
713					       u32 gate_mask)
714{
715	struct taprio_sched *q = qdisc_priv(sch);
716	struct net_device *dev = qdisc_dev(sch);
717	struct Qdisc *child = q->qdiscs[txq];
718	int num_tc = netdev_get_num_tc(dev);
719	struct sk_buff *skb;
720	ktime_t guard;
721	int prio;
722	int len;
723	u8 tc;
724
725	if (unlikely(!child))
726		return NULL;
727
728	if (TXTIME_ASSIST_IS_ENABLED(q->flags))
729		goto skip_peek_checks;
730
731	skb = child->ops->peek(child);
732	if (!skb)
733		return NULL;
734
735	prio = skb->priority;
736	tc = netdev_get_prio_tc_map(dev, prio);
737
738	if (!(gate_mask & BIT(tc)))
739		return NULL;
740
741	len = qdisc_pkt_len(skb);
742	guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
743
744	/* In the case that there's no gate entry, there's no
745	 * guard band ...
746	 */
747	if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
748	    !taprio_entry_allows_tx(guard, entry, tc))
749		return NULL;
750
751	/* ... and no budget. */
752	if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
753	    taprio_update_budgets(entry, len, tc, num_tc) < 0)
754		return NULL;
755
756skip_peek_checks:
757	skb = child->ops->dequeue(child);
758	if (unlikely(!skb))
759		return NULL;
760
761	qdisc_bstats_update(sch, skb);
762	qdisc_qstats_backlog_dec(sch, skb);
763	sch->q.qlen--;
764
765	return skb;
766}
767
768static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
769{
770	int offset = dev->tc_to_txq[tc].offset;
771	int count = dev->tc_to_txq[tc].count;
772
773	(*txq)++;
774	if (*txq == offset + count)
775		*txq = offset;
776}
777
778/* Prioritize higher traffic classes, and select among TXQs belonging to the
779 * same TC using round robin
780 */
781static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
782						  struct sched_entry *entry,
783						  u32 gate_mask)
784{
785	struct taprio_sched *q = qdisc_priv(sch);
786	struct net_device *dev = qdisc_dev(sch);
787	int num_tc = netdev_get_num_tc(dev);
788	struct sk_buff *skb;
789	int tc;
790
791	for (tc = num_tc - 1; tc >= 0; tc--) {
792		int first_txq = q->cur_txq[tc];
793
794		if (!(gate_mask & BIT(tc)))
795			continue;
796
797		do {
798			skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
799						      entry, gate_mask);
800
801			taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
802
803			if (q->cur_txq[tc] >= dev->num_tx_queues)
804				q->cur_txq[tc] = first_txq;
805
806			if (skb)
807				return skb;
808		} while (q->cur_txq[tc] != first_txq);
809	}
810
811	return NULL;
812}
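
/* Round-robin example (hypothetical mapping "queues 2@2" for TC1, i.e. TXQs 2
 * and 3): if cur_txq[1] is 3, TXQ 3 is tried first, taprio_next_tc_txq()
 * wraps cur_txq[1] back to 2, and TXQ 2 is tried next before giving up on
 * this traffic class.
 */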
813
814/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
815 * class other than to determine whether the gate is open or not
816 */
817static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
818						   struct sched_entry *entry,
819						   u32 gate_mask)
820{
821	struct net_device *dev = qdisc_dev(sch);
822	struct sk_buff *skb;
823	int i;
824
825	for (i = 0; i < dev->num_tx_queues; i++) {
826		skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
827		if (skb)
828			return skb;
829	}
830
831	return NULL;
832}
833
834/* Will not be called in the full offload case, since the TX queues are
835 * attached to the Qdisc created using qdisc_create_dflt()
836 */
837static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
838{
839	struct taprio_sched *q = qdisc_priv(sch);
840	struct sk_buff *skb = NULL;
841	struct sched_entry *entry;
842	u32 gate_mask;
843
844	rcu_read_lock();
845	entry = rcu_dereference(q->current_entry);
846	/* if there's no entry, it means that the schedule didn't
847	 * start yet, so force all gates to be open; this is in
848	 * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
849	 * "AdminGateStates"
850	 */
851	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
852	if (!gate_mask)
853		goto done;
854
855	if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
856	    !static_branch_likely(&taprio_have_working_mqprio)) {
857		/* Single NIC kind which is broken */
858		skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
859	} else if (static_branch_likely(&taprio_have_working_mqprio) &&
860		   !static_branch_unlikely(&taprio_have_broken_mqprio)) {
861		/* Single NIC kind which prioritizes properly */
862		skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
863	} else {
864		/* Mixed NIC kinds present in system, need dynamic testing */
865		if (q->broken_mqprio)
866			skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
867		else
868			skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
869	}
870
871done:
872	rcu_read_unlock();
873
874	return skb;
875}
876
877static bool should_restart_cycle(const struct sched_gate_list *oper,
878				 const struct sched_entry *entry)
879{
880	if (list_is_last(&entry->list, &oper->entries))
881		return true;
882
883	if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
884		return true;
885
886	return false;
887}
888
889static bool should_change_schedules(const struct sched_gate_list *admin,
890				    const struct sched_gate_list *oper,
891				    ktime_t end_time)
892{
893	ktime_t next_base_time, extension_time;
894
895	if (!admin)
896		return false;
897
898	next_base_time = sched_base_time(admin);
899
900	/* This is the simple case, the end_time would fall after
901	/* This is the simple case: the end_time would fall after
902	 */
903	if (ktime_compare(next_base_time, end_time) <= 0)
904		return true;
905
906	/* This is the cycle_time_extension case, if the end_time
907	/* This is the cycle_time_extension case: if the end_time
908	 * next schedule base_time, we can extend the current schedule
909	 * for that amount.
910	 */
911	extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
912
913	/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
914	 * how precisely the extension should be made. So after
915	 * conformance testing, this logic may change.
916	 */
917	if (ktime_compare(next_base_time, extension_time) <= 0)
918		return true;
919
920	return false;
921}
922
923static enum hrtimer_restart advance_sched(struct hrtimer *timer)
924{
925	struct taprio_sched *q = container_of(timer, struct taprio_sched,
926					      advance_timer);
927	struct net_device *dev = qdisc_dev(q->root);
928	struct sched_gate_list *oper, *admin;
929	int num_tc = netdev_get_num_tc(dev);
930	struct sched_entry *entry, *next;
931	struct Qdisc *sch = q->root;
932	ktime_t end_time;
933	int tc;
934
935	spin_lock(&q->current_entry_lock);
936	entry = rcu_dereference_protected(q->current_entry,
937					  lockdep_is_held(&q->current_entry_lock));
938	oper = rcu_dereference_protected(q->oper_sched,
939					 lockdep_is_held(&q->current_entry_lock));
940	admin = rcu_dereference_protected(q->admin_sched,
941					  lockdep_is_held(&q->current_entry_lock));
942
943	if (!oper)
944		switch_schedules(q, &admin, &oper);
945
946	/* This can happen in two cases: 1. this is the very first run
947	 * of this function (i.e. we weren't running any schedule
948	 * previously); 2. The previous schedule just ended. The first
949	 * entry of all schedules are pre-calculated during the
950	 * entry of each schedule is pre-calculated during the
951	 */
952	if (unlikely(!entry || entry->end_time == oper->base_time)) {
953		next = list_first_entry(&oper->entries, struct sched_entry,
954					list);
955		end_time = next->end_time;
956		goto first_run;
957	}
958
959	if (should_restart_cycle(oper, entry)) {
960		next = list_first_entry(&oper->entries, struct sched_entry,
961					list);
962		oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
963						    oper->cycle_time);
964	} else {
965		next = list_next_entry(entry, list);
966	}
967
968	end_time = ktime_add_ns(entry->end_time, next->interval);
969	end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
970
971	for (tc = 0; tc < num_tc; tc++) {
972		if (next->gate_duration[tc] == oper->cycle_time)
973			next->gate_close_time[tc] = KTIME_MAX;
974		else
975			next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
976								 next->gate_duration[tc]);
977	}
978
979	if (should_change_schedules(admin, oper, end_time)) {
980		/* Set things up so that the next time this function runs,
981		 * the new schedule takes effect.
982		 */
983		end_time = sched_base_time(admin);
984		switch_schedules(q, &admin, &oper);
985	}
986
987	next->end_time = end_time;
988	taprio_set_budgets(q, oper, next);
989
990first_run:
991	rcu_assign_pointer(q->current_entry, next);
992	spin_unlock(&q->current_entry_lock);
993
994	hrtimer_set_expires(&q->advance_timer, end_time);
995
996	rcu_read_lock();
997	__netif_schedule(sch);
998	rcu_read_unlock();
999
1000	return HRTIMER_RESTART;
1001}
1002
1003static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
1004	[TCA_TAPRIO_SCHED_ENTRY_INDEX]	   = { .type = NLA_U32 },
1005	[TCA_TAPRIO_SCHED_ENTRY_CMD]	   = { .type = NLA_U8 },
1006	[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
1007	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
1008};
1009
1010static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
1011	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = NLA_POLICY_MAX(NLA_U32,
1012							    TC_QOPT_MAX_QUEUE),
1013	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
1014	[TCA_TAPRIO_TC_ENTRY_FP]	   = NLA_POLICY_RANGE(NLA_U32,
1015							      TC_FP_EXPRESS,
1016							      TC_FP_PREEMPTIBLE),
1017};
1018
1019static struct netlink_range_validation_signed taprio_cycle_time_range = {
1020	.min = 0,
1021	.max = INT_MAX,
1022};
1023
1024static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
1025	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
1026		.len = sizeof(struct tc_mqprio_qopt)
1027	},
1028	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]           = { .type = NLA_NESTED },
1029	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]            = { .type = NLA_S64 },
1030	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]         = { .type = NLA_NESTED },
1031	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]              = { .type = NLA_S32 },
1032	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]           =
1033		NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range),
1034	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
1035	[TCA_TAPRIO_ATTR_FLAGS]                      = { .type = NLA_U32 },
1036	[TCA_TAPRIO_ATTR_TXTIME_DELAY]		     = { .type = NLA_U32 },
1037	[TCA_TAPRIO_ATTR_TC_ENTRY]		     = { .type = NLA_NESTED },
1038};
1039
1040static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
1041			    struct sched_entry *entry,
1042			    struct netlink_ext_ack *extack)
1043{
1044	int min_duration = length_to_duration(q, ETH_ZLEN);
1045	u32 interval = 0;
1046
1047	if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
1048		entry->command = nla_get_u8(
1049			tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
1050
1051	if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
1052		entry->gate_mask = nla_get_u32(
1053			tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
1054
1055	if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
1056		interval = nla_get_u32(
1057			tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
1058
1059	/* The interval should allow at least the minimum Ethernet
1060	 * frame to go out.
1061	 */
1062	if (interval < min_duration) {
1063		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
1064		return -EINVAL;
1065	}
1066
1067	entry->interval = interval;
1068
1069	return 0;
1070}
1071
1072static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n,
1073			     struct sched_entry *entry, int index,
1074			     struct netlink_ext_ack *extack)
1075{
1076	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
1077	int err;
1078
1079	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
1080					  entry_policy, NULL);
1081	if (err < 0) {
1082		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
1083		return -EINVAL;
1084	}
1085
1086	entry->index = index;
1087
1088	return fill_sched_entry(q, tb, entry, extack);
1089}
1090
1091static int parse_sched_list(struct taprio_sched *q, struct nlattr *list,
1092			    struct sched_gate_list *sched,
1093			    struct netlink_ext_ack *extack)
1094{
1095	struct nlattr *n;
1096	int err, rem;
1097	int i = 0;
1098
1099	if (!list)
1100		return -EINVAL;
1101
1102	nla_for_each_nested(n, list, rem) {
1103		struct sched_entry *entry;
1104
1105		if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
1106			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
1107			continue;
1108		}
1109
1110		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1111		if (!entry) {
1112			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
1113			return -ENOMEM;
1114		}
1115
1116		err = parse_sched_entry(q, n, entry, i, extack);
1117		if (err < 0) {
1118			kfree(entry);
1119			return err;
1120		}
1121
1122		list_add_tail(&entry->list, &sched->entries);
1123		i++;
1124	}
1125
1126	sched->num_entries = i;
1127
1128	return i;
1129}
1130
1131static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
1132				 struct sched_gate_list *new,
1133				 struct netlink_ext_ack *extack)
1134{
1135	int err = 0;
1136
1137	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
1138		NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
1139		return -ENOTSUPP;
1140	}
1141
1142	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
1143		new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
1144
1145	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
1146		new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);
1147
1148	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
1149		new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);
1150
1151	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
1152		err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST],
1153				       new, extack);
1154	if (err < 0)
1155		return err;
1156
1157	if (!new->cycle_time) {
1158		struct sched_entry *entry;
1159		ktime_t cycle = 0;
1160
1161		list_for_each_entry(entry, &new->entries, list)
1162			cycle = ktime_add_ns(cycle, entry->interval);
1163
1164		if (!cycle) {
1165			NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0");
1166			return -EINVAL;
1167		}
1168
1169		if (cycle < 0 || cycle > INT_MAX) {
1170			NL_SET_ERR_MSG(extack, "'cycle_time' is too big");
1171			return -EINVAL;
1172		}
1173
1174		new->cycle_time = cycle;
1175	}
1176
1177	taprio_calculate_gate_durations(q, new);
1178
1179	return 0;
1180}
1181
1182static int taprio_parse_mqprio_opt(struct net_device *dev,
1183				   struct tc_mqprio_qopt *qopt,
1184				   struct netlink_ext_ack *extack,
1185				   u32 taprio_flags)
1186{
1187	bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);
1188
1189	if (!qopt && !dev->num_tc) {
1190		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
1191		return -EINVAL;
1192	}
1193
1194	/* If num_tc is already set, it means that the user already
1195	 * configured the mqprio part
1196	 */
1197	if (dev->num_tc)
1198		return 0;
1199
1200	/* taprio requires that traffic classes map 1:n to TX queues */
1201	if (qopt->num_tc > dev->num_tx_queues) {
1202		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
1203		return -EINVAL;
1204	}
1205
1206	/* For some reason, in txtime-assist mode, we allow TXQ ranges for
1207	 * different TCs to overlap, and just validate the TXQ ranges.
1208	 */
1209	return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
1210				    extack);
1211}
1212
1213static int taprio_get_start_time(struct Qdisc *sch,
1214				 struct sched_gate_list *sched,
1215				 ktime_t *start)
1216{
1217	struct taprio_sched *q = qdisc_priv(sch);
1218	ktime_t now, base, cycle;
1219	s64 n;
1220
1221	base = sched_base_time(sched);
1222	now = taprio_get_time(q);
1223
1224	if (ktime_after(base, now)) {
1225		*start = base;
1226		return 0;
1227	}
1228
1229	cycle = sched->cycle_time;
1230
1231	/* The qdisc is expected to have at least one sched_entry.  Moreover,
1232	 * any entry must have 'interval' > 0. Thus if the cycle time is zero,
1233	 * something went really wrong. In that case, we should warn about this
1234	 * inconsistent state and return an error.
1235	 */
1236	if (WARN_ON(!cycle))
1237		return -EFAULT;
1238
1239	/* Schedule the start time for the beginning of the next
1240	 * cycle.
1241	 */
1242	n = div64_s64(ktime_sub_ns(now, base), cycle);
1243	*start = ktime_add_ns(base, (n + 1) * cycle);
1244	return 0;
1245}
1246
1247static void setup_first_end_time(struct taprio_sched *q,
1248				 struct sched_gate_list *sched, ktime_t base)
1249{
1250	struct net_device *dev = qdisc_dev(q->root);
1251	int num_tc = netdev_get_num_tc(dev);
1252	struct sched_entry *first;
1253	ktime_t cycle;
1254	int tc;
1255
1256	first = list_first_entry(&sched->entries,
1257				 struct sched_entry, list);
1258
1259	cycle = sched->cycle_time;
1260
1261	/* FIXME: find a better place to do this */
1262	sched->cycle_end_time = ktime_add_ns(base, cycle);
1263
1264	first->end_time = ktime_add_ns(base, first->interval);
1265	taprio_set_budgets(q, sched, first);
1266
1267	for (tc = 0; tc < num_tc; tc++) {
1268		if (first->gate_duration[tc] == sched->cycle_time)
1269			first->gate_close_time[tc] = KTIME_MAX;
1270		else
1271			first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
1272	}
1273
1274	rcu_assign_pointer(q->current_entry, NULL);
1275}
1276
1277static void taprio_start_sched(struct Qdisc *sch,
1278			       ktime_t start, struct sched_gate_list *new)
1279{
1280	struct taprio_sched *q = qdisc_priv(sch);
1281	ktime_t expires;
1282
1283	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
1284		return;
1285
1286	expires = hrtimer_get_expires(&q->advance_timer);
1287	if (expires == 0)
1288		expires = KTIME_MAX;
1289
1290	/* If the new schedule starts before the next expiration, we
1291	 * reprogram the timer to the earliest one, so that we change the admin
1292	 * schedule to the operational one at the right time.
1293	 */
1294	start = min_t(ktime_t, start, expires);
1295
1296	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
1297}
1298
1299static void taprio_set_picos_per_byte(struct net_device *dev,
1300				      struct taprio_sched *q)
1301{
1302	struct ethtool_link_ksettings ecmd;
1303	int speed = SPEED_10;
1304	int picos_per_byte;
1305	int err;
1306
1307	err = __ethtool_get_link_ksettings(dev, &ecmd);
1308	if (err < 0)
1309		goto skip;
1310
1311	if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
1312		speed = ecmd.base.speed;
1313
1314skip:
1315	picos_per_byte = (USEC_PER_SEC * 8) / speed;
1316
1317	atomic64_set(&q->picos_per_byte, picos_per_byte);
1318	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
1319		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
1320		   ecmd.base.speed);
1321}
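
/* The arithmetic above: with 'speed' in Mbit/s, (USEC_PER_SEC * 8) / speed is
 * picoseconds per byte. For example, 1000 Mbit/s gives 8 * 1000000 / 1000 =
 * 8000 ps/byte (8 ns/byte), and 10 Mbit/s (the SPEED_10 fallback) gives
 * 800000 ps/byte.
 */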
1322
1323static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
1324			       void *ptr)
1325{
1326	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1327	struct sched_gate_list *oper, *admin;
1328	struct qdisc_size_table *stab;
1329	struct taprio_sched *q;
1330
1331	ASSERT_RTNL();
1332
1333	if (event != NETDEV_UP && event != NETDEV_CHANGE)
1334		return NOTIFY_DONE;
1335
1336	list_for_each_entry(q, &taprio_list, taprio_list) {
1337		if (dev != qdisc_dev(q->root))
1338			continue;
1339
1340		taprio_set_picos_per_byte(dev, q);
1341
1342		stab = rtnl_dereference(q->root->stab);
1343
1344		oper = rtnl_dereference(q->oper_sched);
1345		if (oper)
1346			taprio_update_queue_max_sdu(q, oper, stab);
1347
1348		admin = rtnl_dereference(q->admin_sched);
1349		if (admin)
1350			taprio_update_queue_max_sdu(q, admin, stab);
1351
1352		break;
1353	}
1354
1355	return NOTIFY_DONE;
1356}
1357
1358static void setup_txtime(struct taprio_sched *q,
1359			 struct sched_gate_list *sched, ktime_t base)
1360{
1361	struct sched_entry *entry;
1362	u64 interval = 0;
1363
1364	list_for_each_entry(entry, &sched->entries, list) {
1365		entry->next_txtime = ktime_add_ns(base, interval);
1366		interval += entry->interval;
1367	}
1368}
1369
1370static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
1371{
1372	struct __tc_taprio_qopt_offload *__offload;
1373
1374	__offload = kzalloc(struct_size(__offload, offload.entries, num_entries),
1375			    GFP_KERNEL);
1376	if (!__offload)
1377		return NULL;
1378
1379	refcount_set(&__offload->users, 1);
1380
1381	return &__offload->offload;
1382}
1383
1384struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
1385						  *offload)
1386{
1387	struct __tc_taprio_qopt_offload *__offload;
1388
1389	__offload = container_of(offload, struct __tc_taprio_qopt_offload,
1390				 offload);
1391
1392	refcount_inc(&__offload->users);
1393
1394	return offload;
1395}
1396EXPORT_SYMBOL_GPL(taprio_offload_get);
1397
1398void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
1399{
1400	struct __tc_taprio_qopt_offload *__offload;
1401
1402	__offload = container_of(offload, struct __tc_taprio_qopt_offload,
1403				 offload);
1404
1405	if (!refcount_dec_and_test(&__offload->users))
1406		return;
1407
1408	kfree(__offload);
1409}
1410EXPORT_SYMBOL_GPL(taprio_offload_free);
1411
1412/* This function only serves to keep the pointers to the "oper" and "admin"
1413 * schedules valid in relation to their base times, so that when calling
1414 * dump() the user looks at the right schedules.
1415 * When using full offload, the admin configuration is promoted to oper at the
1416 * base_time in the PHC time domain.  But because the system time is not
1417 * necessarily in sync with that, we can't just trigger a hrtimer to call
1418 * switch_schedules at the right hardware time.
1419 * At the moment we call this by hand right away from taprio, but in the future
1420 * it will be useful to create a mechanism for drivers to notify taprio of the
1421 * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
1422 * This is left as TODO.
1423 */
1424static void taprio_offload_config_changed(struct taprio_sched *q)
1425{
1426	struct sched_gate_list *oper, *admin;
1427
1428	oper = rtnl_dereference(q->oper_sched);
1429	admin = rtnl_dereference(q->admin_sched);
1430
1431	switch_schedules(q, &admin, &oper);
1432}
1433
1434static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
1435{
1436	u32 i, queue_mask = 0;
1437
1438	for (i = 0; i < dev->num_tc; i++) {
1439		u32 offset, count;
1440
1441		if (!(tc_mask & BIT(i)))
1442			continue;
1443
1444		offset = dev->tc_to_txq[i].offset;
1445		count = dev->tc_to_txq[i].count;
1446
1447		queue_mask |= GENMASK(offset + count - 1, offset);
1448	}
1449
1450	return queue_mask;
1451}
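
/* Example (hypothetical "queues 1@0 1@1 2@2" mapping): a tc_mask of 0x5
 * (TC0 and TC2) selects TXQ 0 for TC0 and TXQs 2-3 for TC2, so the returned
 * queue_mask is 0x1 | 0xc = 0xd.
 */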
1452
1453static void taprio_sched_to_offload(struct net_device *dev,
1454				    struct sched_gate_list *sched,
1455				    struct tc_taprio_qopt_offload *offload,
1456				    const struct tc_taprio_caps *caps)
1457{
1458	struct sched_entry *entry;
1459	int i = 0;
1460
1461	offload->base_time = sched->base_time;
1462	offload->cycle_time = sched->cycle_time;
1463	offload->cycle_time_extension = sched->cycle_time_extension;
1464
1465	list_for_each_entry(entry, &sched->entries, list) {
1466		struct tc_taprio_sched_entry *e = &offload->entries[i];
1467
1468		e->command = entry->command;
1469		e->interval = entry->interval;
1470		if (caps->gate_mask_per_txq)
1471			e->gate_mask = tc_map_to_queue_mask(dev,
1472							    entry->gate_mask);
1473		else
1474			e->gate_mask = entry->gate_mask;
1475
1476		i++;
1477	}
1478
1479	offload->num_entries = i;
1480}
1481
1482static void taprio_detect_broken_mqprio(struct taprio_sched *q)
1483{
1484	struct net_device *dev = qdisc_dev(q->root);
1485	struct tc_taprio_caps caps;
1486
1487	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
1488				 &caps, sizeof(caps));
1489
1490	q->broken_mqprio = caps.broken_mqprio;
1491	if (q->broken_mqprio)
1492		static_branch_inc(&taprio_have_broken_mqprio);
1493	else
1494		static_branch_inc(&taprio_have_working_mqprio);
1495
1496	q->detected_mqprio = true;
1497}
1498
1499static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
1500{
1501	if (!q->detected_mqprio)
1502		return;
1503
1504	if (q->broken_mqprio)
1505		static_branch_dec(&taprio_have_broken_mqprio);
1506	else
1507		static_branch_dec(&taprio_have_working_mqprio);
1508}
1509
1510static int taprio_enable_offload(struct net_device *dev,
1511				 struct taprio_sched *q,
1512				 struct sched_gate_list *sched,
1513				 struct netlink_ext_ack *extack)
1514{
1515	const struct net_device_ops *ops = dev->netdev_ops;
1516	struct tc_taprio_qopt_offload *offload;
1517	struct tc_taprio_caps caps;
1518	int tc, err = 0;
1519
1520	if (!ops->ndo_setup_tc) {
1521		NL_SET_ERR_MSG(extack,
1522			       "Device does not support taprio offload");
1523		return -EOPNOTSUPP;
1524	}
1525
1526	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
1527				 &caps, sizeof(caps));
1528
1529	if (!caps.supports_queue_max_sdu) {
1530		for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
1531			if (q->max_sdu[tc]) {
1532				NL_SET_ERR_MSG_MOD(extack,
1533						   "Device does not handle queueMaxSDU");
1534				return -EOPNOTSUPP;
1535			}
1536		}
1537	}
1538
1539	offload = taprio_offload_alloc(sched->num_entries);
1540	if (!offload) {
1541		NL_SET_ERR_MSG(extack,
1542			       "Not enough memory for enabling offload mode");
1543		return -ENOMEM;
1544	}
1545	offload->cmd = TAPRIO_CMD_REPLACE;
1546	offload->extack = extack;
1547	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
1548	offload->mqprio.extack = extack;
1549	taprio_sched_to_offload(dev, sched, offload, &caps);
1550	mqprio_fp_to_offload(q->fp, &offload->mqprio);
1551
1552	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
1553		offload->max_sdu[tc] = q->max_sdu[tc];
1554
1555	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
1556	if (err < 0) {
1557		NL_SET_ERR_MSG_WEAK(extack,
1558				    "Device failed to setup taprio offload");
1559		goto done;
1560	}
1561
1562	q->offloaded = true;
1563
1564done:
1565	/* The offload structure may linger around via a reference taken by the
1566	 * device driver, so clear the netlink extack pointer so that the
1567	 * driver isn't tempted to dereference data which is no longer valid
1568	 */
1569	offload->extack = NULL;
1570	offload->mqprio.extack = NULL;
1571	taprio_offload_free(offload);
1572
1573	return err;
1574}
1575
1576static int taprio_disable_offload(struct net_device *dev,
1577				  struct taprio_sched *q,
1578				  struct netlink_ext_ack *extack)
1579{
1580	const struct net_device_ops *ops = dev->netdev_ops;
1581	struct tc_taprio_qopt_offload *offload;
1582	int err;
1583
1584	if (!q->offloaded)
1585		return 0;
1586
1587	offload = taprio_offload_alloc(0);
1588	if (!offload) {
1589		NL_SET_ERR_MSG(extack,
1590			       "Not enough memory to disable offload mode");
1591		return -ENOMEM;
1592	}
1593	offload->cmd = TAPRIO_CMD_DESTROY;
1594
1595	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
1596	if (err < 0) {
1597		NL_SET_ERR_MSG(extack,
1598			       "Device failed to disable offload");
1599		goto out;
1600	}
1601
1602	q->offloaded = false;
1603
1604out:
1605	taprio_offload_free(offload);
1606
1607	return err;
1608}
1609
1610/* If full offload is enabled, the only possible clockid is the net device's
1611 * PHC. For that reason, specifying a clockid through netlink is incorrect.
1612 * For txtime-assist, it is implicitly assumed that the device's PHC is kept
1613 * in sync with the specified clockid via a user space daemon such as phc2sys.
1614 * For both software taprio and txtime-assist, the clockid is used for the
1615 * hrtimer that advances the schedule, and is hence mandatory.
1616 */
1617static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
1618				struct netlink_ext_ack *extack)
1619{
1620	struct taprio_sched *q = qdisc_priv(sch);
1621	struct net_device *dev = qdisc_dev(sch);
1622	int err = -EINVAL;
1623
1624	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
1625		const struct ethtool_ops *ops = dev->ethtool_ops;
1626		struct ethtool_ts_info info = {
1627			.cmd = ETHTOOL_GET_TS_INFO,
1628			.phc_index = -1,
1629		};
1630
1631		if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
1632			NL_SET_ERR_MSG(extack,
1633				       "The 'clockid' cannot be specified for full offload");
1634			goto out;
1635		}
1636
1637		if (ops && ops->get_ts_info)
1638			err = ops->get_ts_info(dev, &info);
1639
1640		if (err || info.phc_index < 0) {
1641			NL_SET_ERR_MSG(extack,
1642				       "Device does not have a PTP clock");
1643			err = -ENOTSUPP;
1644			goto out;
1645		}
1646	} else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
1647		int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
1648		enum tk_offsets tk_offset;
1649
1650		/* We only support static clockids and we don't allow
1651		 * the clockid to be modified after the first init.
1652		 */
1653		if (clockid < 0 ||
1654		    (q->clockid != -1 && q->clockid != clockid)) {
1655			NL_SET_ERR_MSG(extack,
1656				       "Changing the 'clockid' of a running schedule is not supported");
1657			err = -ENOTSUPP;
1658			goto out;
1659		}
1660
1661		switch (clockid) {
1662		case CLOCK_REALTIME:
1663			tk_offset = TK_OFFS_REAL;
1664			break;
1665		case CLOCK_MONOTONIC:
1666			tk_offset = TK_OFFS_MAX;
1667			break;
1668		case CLOCK_BOOTTIME:
1669			tk_offset = TK_OFFS_BOOT;
1670			break;
1671		case CLOCK_TAI:
1672			tk_offset = TK_OFFS_TAI;
1673			break;
1674		default:
1675			NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
1676			err = -EINVAL;
1677			goto out;
1678		}
1679		/* This pairs with READ_ONCE() in taprio_mono_to_any */
1680		WRITE_ONCE(q->tk_offset, tk_offset);
1681
1682		q->clockid = clockid;
1683	} else {
1684		NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
1685		goto out;
1686	}
1687
1688	/* Everything went OK; return success. */
1689	err = 0;
1690
1691out:
1692	return err;
1693}
1694
1695static int taprio_parse_tc_entry(struct Qdisc *sch,
1696				 struct nlattr *opt,
1697				 u32 max_sdu[TC_QOPT_MAX_QUEUE],
1698				 u32 fp[TC_QOPT_MAX_QUEUE],
1699				 unsigned long *seen_tcs,
1700				 struct netlink_ext_ack *extack)
1701{
1702	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
1703	struct net_device *dev = qdisc_dev(sch);
1704	int err, tc;
1705	u32 val;
1706
1707	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
1708			       taprio_tc_policy, extack);
1709	if (err < 0)
1710		return err;
1711
1712	if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
1713		NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
1714		return -EINVAL;
1715	}
1716
1717	tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
1718	if (tc >= TC_QOPT_MAX_QUEUE) {
1719		NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
1720		return -ERANGE;
1721	}
1722
1723	if (*seen_tcs & BIT(tc)) {
1724		NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
1725		return -EINVAL;
1726	}
1727
1728	*seen_tcs |= BIT(tc);
1729
1730	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
1731		val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
1732		if (val > dev->max_mtu) {
1733			NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
1734			return -ERANGE;
1735		}
1736
1737		max_sdu[tc] = val;
1738	}
1739
1740	if (tb[TCA_TAPRIO_TC_ENTRY_FP])
1741		fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
1742
1743	return 0;
1744}
1745
1746static int taprio_parse_tc_entries(struct Qdisc *sch,
1747				   struct nlattr *opt,
1748				   struct netlink_ext_ack *extack)
1749{
1750	struct taprio_sched *q = qdisc_priv(sch);
1751	struct net_device *dev = qdisc_dev(sch);
1752	u32 max_sdu[TC_QOPT_MAX_QUEUE];
1753	bool have_preemption = false;
1754	unsigned long seen_tcs = 0;
1755	u32 fp[TC_QOPT_MAX_QUEUE];
1756	struct nlattr *n;
1757	int tc, rem;
1758	int err = 0;
1759
1760	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
1761		max_sdu[tc] = q->max_sdu[tc];
1762		fp[tc] = q->fp[tc];
1763	}
1764
1765	nla_for_each_nested(n, opt, rem) {
1766		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
1767			continue;
1768
1769		err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
1770					    extack);
1771		if (err)
1772			return err;
1773	}
1774
1775	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
1776		q->max_sdu[tc] = max_sdu[tc];
1777		q->fp[tc] = fp[tc];
1778		if (fp[tc] != TC_FP_EXPRESS)
1779			have_preemption = true;
1780	}
1781
1782	if (have_preemption) {
1783		if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
1784			NL_SET_ERR_MSG(extack,
1785				       "Preemption only supported with full offload");
1786			return -EOPNOTSUPP;
1787		}
1788
1789		if (!ethtool_dev_mm_supported(dev)) {
1790			NL_SET_ERR_MSG(extack,
1791				       "Device does not support preemption");
1792			return -EOPNOTSUPP;
1793		}
1794	}
1795
1796	return err;
1797}
1798
1799static int taprio_mqprio_cmp(const struct net_device *dev,
1800			     const struct tc_mqprio_qopt *mqprio)
1801{
1802	int i;
1803
1804	if (!mqprio || mqprio->num_tc != dev->num_tc)
1805		return -1;
1806
1807	for (i = 0; i < mqprio->num_tc; i++)
1808		if (dev->tc_to_txq[i].count != mqprio->count[i] ||
1809		    dev->tc_to_txq[i].offset != mqprio->offset[i])
1810			return -1;
1811
1812	for (i = 0; i <= TC_BITMASK; i++)
1813		if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
1814			return -1;
1815
1816	return 0;
1817}
1818
1819/* The semantics of the 'flags' argument in relation to 'change()'
1820 * requests are interpreted following two rules (which are applied in
1821 * this order): (1) an omitted 'flags' argument is interpreted as
1822 * zero; (2) the 'flags' of a "running" taprio instance cannot be
1823 * changed.
1824 */
1825static int taprio_new_flags(const struct nlattr *attr, u32 old,
1826			    struct netlink_ext_ack *extack)
1827{
1828	u32 new = 0;
1829
1830	if (attr)
1831		new = nla_get_u32(attr);
1832
1833	if (old != TAPRIO_FLAGS_INVALID && old != new) {
1834		NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
1835		return -EOPNOTSUPP;
1836	}
1837
1838	if (!taprio_flags_valid(new)) {
1839		NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
1840		return -EINVAL;
1841	}
1842
1843	return new;
1844}
1845
1846static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
1847			 struct netlink_ext_ack *extack)
1848{
1849	struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
1850	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
1851	struct sched_gate_list *oper, *admin, *new_admin;
1852	struct taprio_sched *q = qdisc_priv(sch);
1853	struct net_device *dev = qdisc_dev(sch);
1854	struct tc_mqprio_qopt *mqprio = NULL;
1855	unsigned long flags;
1856	ktime_t start;
1857	int i, err;
1858
1859	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
1860					  taprio_policy, extack);
1861	if (err < 0)
1862		return err;
1863
1864	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
1865		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
1866
1867	err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS],
1868			       q->flags, extack);
1869	if (err < 0)
1870		return err;
1871
1872	q->flags = err;
1873
1874	err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
1875	if (err < 0)
1876		return err;
1877
1878	err = taprio_parse_tc_entries(sch, opt, extack);
1879	if (err)
1880		return err;
1881
1882	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
1883	if (!new_admin) {
1884		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
1885		return -ENOMEM;
1886	}
1887	INIT_LIST_HEAD(&new_admin->entries);
1888
1889	oper = rtnl_dereference(q->oper_sched);
1890	admin = rtnl_dereference(q->admin_sched);
1891
1892	/* mqprio config unchanged - treat it as if no new settings were given */
1893	if (!taprio_mqprio_cmp(dev, mqprio))
1894		mqprio = NULL;
1895
1896	if (mqprio && (oper || admin)) {
1897		NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
1898		err = -EOPNOTSUPP;
1899		goto free_sched;
1900	}
1901
1902	if (mqprio) {
1903		err = netdev_set_num_tc(dev, mqprio->num_tc);
1904		if (err)
1905			goto free_sched;
1906		for (i = 0; i < mqprio->num_tc; i++) {
1907			netdev_set_tc_queue(dev, i,
1908					    mqprio->count[i],
1909					    mqprio->offset[i]);
1910			q->cur_txq[i] = mqprio->offset[i];
1911		}
1912
1913		/* Always use supplied priority mappings */
1914		for (i = 0; i <= TC_BITMASK; i++)
1915			netdev_set_prio_tc_map(dev, i,
1916					       mqprio->prio_tc_map[i]);
1917	}
1918
1919	err = parse_taprio_schedule(q, tb, new_admin, extack);
1920	if (err < 0)
1921		goto free_sched;
1922
1923	if (new_admin->num_entries == 0) {
1924		NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
1925		err = -EINVAL;
1926		goto free_sched;
1927	}
1928
1929	err = taprio_parse_clockid(sch, tb, extack);
1930	if (err < 0)
1931		goto free_sched;
1932
1933	taprio_set_picos_per_byte(dev, q);
1934	taprio_update_queue_max_sdu(q, new_admin, stab);
1935
1936	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
1937		err = taprio_enable_offload(dev, q, new_admin, extack);
1938	else
1939		err = taprio_disable_offload(dev, q, extack);
1940	if (err)
1941		goto free_sched;
1942
1943	/* Protects against enqueue()/dequeue() */
1944	spin_lock_bh(qdisc_lock(sch));
1945
1946	if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
1947		if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
1948			NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
1949			err = -EINVAL;
1950			goto unlock;
1951		}
1952
1953		q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
1954	}
1955
1956	if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
1957	    !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
1958	    !hrtimer_active(&q->advance_timer)) {
1959		hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
1960		q->advance_timer.function = advance_sched;
1961	}
1962
1963	err = taprio_get_start_time(sch, new_admin, &start);
1964	if (err < 0) {
1965		NL_SET_ERR_MSG(extack, "Internal error: failed to get start time");
1966		goto unlock;
1967	}
1968
1969	setup_txtime(q, new_admin, start);
1970
1971	if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
1972		if (!oper) {
1973			rcu_assign_pointer(q->oper_sched, new_admin);
1974			err = 0;
1975			new_admin = NULL;
1976			goto unlock;
1977		}
1978
1979		rcu_assign_pointer(q->admin_sched, new_admin);
1980		if (admin)
1981			call_rcu(&admin->rcu, taprio_free_sched_cb);
1982	} else {
1983		setup_first_end_time(q, new_admin, start);
1984
1985		/* Protects against advance_sched() */
1986		spin_lock_irqsave(&q->current_entry_lock, flags);
1987
1988		taprio_start_sched(sch, start, new_admin);
1989
1990		rcu_assign_pointer(q->admin_sched, new_admin);
1991		if (admin)
1992			call_rcu(&admin->rcu, taprio_free_sched_cb);
1993
1994		spin_unlock_irqrestore(&q->current_entry_lock, flags);
1995
1996		if (FULL_OFFLOAD_IS_ENABLED(q->flags))
1997			taprio_offload_config_changed(q);
1998	}
1999
2000	new_admin = NULL;
2001	err = 0;
2002
2003	if (!stab)
2004		NL_SET_ERR_MSG_MOD(extack,
2005				   "Size table not specified, frame length estimations may be inaccurate");
2006
2007unlock:
2008	spin_unlock_bh(qdisc_lock(sch));
2009
2010free_sched:
2011	if (new_admin)
2012		call_rcu(&new_admin->rcu, taprio_free_sched_cb);
2013
2014	return err;
2015}
2016
2017static void taprio_reset(struct Qdisc *sch)
2018{
2019	struct taprio_sched *q = qdisc_priv(sch);
2020	struct net_device *dev = qdisc_dev(sch);
2021	int i;
2022
2023	hrtimer_cancel(&q->advance_timer);
2024
2025	if (q->qdiscs) {
2026		for (i = 0; i < dev->num_tx_queues; i++)
2027			if (q->qdiscs[i])
2028				qdisc_reset(q->qdiscs[i]);
2029	}
2030}
2031
2032static void taprio_destroy(struct Qdisc *sch)
2033{
2034	struct taprio_sched *q = qdisc_priv(sch);
2035	struct net_device *dev = qdisc_dev(sch);
2036	struct sched_gate_list *oper, *admin;
2037	unsigned int i;
2038
2039	list_del(&q->taprio_list);
2040
2041	/* Note that taprio_reset() might not be called if an error
2042	 * happens in qdisc_create(), after taprio_init() has been called.
2043	 */
2044	hrtimer_cancel(&q->advance_timer);
2045	qdisc_synchronize(sch);
2046
2047	taprio_disable_offload(dev, q, NULL);
2048
2049	if (q->qdiscs) {
2050		for (i = 0; i < dev->num_tx_queues; i++)
2051			qdisc_put(q->qdiscs[i]);
2052
2053		kfree(q->qdiscs);
2054	}
2055	q->qdiscs = NULL;
2056
2057	netdev_reset_tc(dev);
2058
2059	oper = rtnl_dereference(q->oper_sched);
2060	admin = rtnl_dereference(q->admin_sched);
2061
2062	if (oper)
2063		call_rcu(&oper->rcu, taprio_free_sched_cb);
2064
2065	if (admin)
2066		call_rcu(&admin->rcu, taprio_free_sched_cb);
2067
2068	taprio_cleanup_broken_mqprio(q);
2069}
2070
2071static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
2072		       struct netlink_ext_ack *extack)
2073{
2074	struct taprio_sched *q = qdisc_priv(sch);
2075	struct net_device *dev = qdisc_dev(sch);
2076	int i, tc;
2077
2078	spin_lock_init(&q->current_entry_lock);
2079
2080	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
2081	q->advance_timer.function = advance_sched;
2082
2083	q->root = sch;
2084
2085	/* We only support static clockids. Use an invalid value as default
2086	 * and get the valid one on taprio_change().
2087	 */
2088	q->clockid = -1;
2089	q->flags = TAPRIO_FLAGS_INVALID;
2090
2091	list_add(&q->taprio_list, &taprio_list);
2092
2093	if (sch->parent != TC_H_ROOT) {
2094		NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc");
2095		return -EOPNOTSUPP;
2096	}
2097
2098	if (!netif_is_multiqueue(dev)) {
2099		NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required");
2100		return -EOPNOTSUPP;
2101	}
2102
2103	q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]),
2104			    GFP_KERNEL);
2105	if (!q->qdiscs)
2106		return -ENOMEM;
2107
2108	if (!opt)
2109		return -EINVAL;
2110
2111	for (i = 0; i < dev->num_tx_queues; i++) {
2112		struct netdev_queue *dev_queue;
2113		struct Qdisc *qdisc;
2114
2115		dev_queue = netdev_get_tx_queue(dev, i);
2116		qdisc = qdisc_create_dflt(dev_queue,
2117					  &pfifo_qdisc_ops,
2118					  TC_H_MAKE(TC_H_MAJ(sch->handle),
2119						    TC_H_MIN(i + 1)),
2120					  extack);
2121		if (!qdisc)
2122			return -ENOMEM;
2123
2124		if (i < dev->real_num_tx_queues)
2125			qdisc_hash_add(qdisc, false);
2126
2127		q->qdiscs[i] = qdisc;
2128	}
2129
2130	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
2131		q->fp[tc] = TC_FP_EXPRESS;
2132
2133	taprio_detect_broken_mqprio(q);
2134
2135	return taprio_change(sch, opt, extack);
2136}
2137
2138static void taprio_attach(struct Qdisc *sch)
2139{
2140	struct taprio_sched *q = qdisc_priv(sch);
2141	struct net_device *dev = qdisc_dev(sch);
2142	unsigned int ntx;
2143
2144	/* Attach underlying qdisc */
2145	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
2146		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
2147		struct Qdisc *old, *dev_queue_qdisc;
2148
2149		if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
2150			struct Qdisc *qdisc = q->qdiscs[ntx];
2151
2152			/* In offload mode, the root taprio qdisc is bypassed
2153			 * and the netdev TX queues see the children directly
2154			 */
2155			qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
2156			dev_queue_qdisc = qdisc;
2157		} else {
2158			/* In software mode, attach the root taprio qdisc
2159			 * to all netdev TX queues, so that dev_qdisc_enqueue()
2160			 * goes through taprio_enqueue().
2161			 */
2162			dev_queue_qdisc = sch;
2163		}
2164		old = dev_graft_qdisc(dev_queue, dev_queue_qdisc);
2165		/* The qdisc's refcount needs to be elevated once
2166		 * for each netdev TX queue it is grafted onto
2167		 */
2168		qdisc_refcount_inc(dev_queue_qdisc);
2169		if (old)
2170			qdisc_put(old);
2171	}
2172}
2173
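/* Class ids seen by the class ops below are 1-based: minor N maps to TX
 * queue N - 1, mirroring how taprio_init() creates the child qdiscs with
 * TC_H_MIN(i + 1).
 */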
2174static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
2175					     unsigned long cl)
2176{
2177	struct net_device *dev = qdisc_dev(sch);
2178	unsigned long ntx = cl - 1;
2179
2180	if (ntx >= dev->num_tx_queues)
2181		return NULL;
2182
2183	return netdev_get_tx_queue(dev, ntx);
2184}
2185
2186static int taprio_graft(struct Qdisc *sch, unsigned long cl,
2187			struct Qdisc *new, struct Qdisc **old,
2188			struct netlink_ext_ack *extack)
2189{
2190	struct taprio_sched *q = qdisc_priv(sch);
2191	struct net_device *dev = qdisc_dev(sch);
2192	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
2193
2194	if (!dev_queue)
2195		return -EINVAL;
2196
2197	if (dev->flags & IFF_UP)
2198		dev_deactivate(dev);
2199
2200	/* In offload mode, the child Qdisc is directly attached to the netdev
2201	 * TX queue, and thus, we need to keep its refcount elevated in order
2202	 * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue.
2203	 * However, in both the software and offload cases, save the reference
2204	 * to the new qdisc in the private array so that we keep an up-to-date
2205	 * reference to our children.
2206	 */
2207	*old = q->qdiscs[cl - 1];
2208	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
2209		WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old);
2210		if (new)
2211			qdisc_refcount_inc(new);
2212		if (*old)
2213			qdisc_put(*old);
2214	}
2215
2216	q->qdiscs[cl - 1] = new;
2217	if (new)
2218		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
2219
2220	if (dev->flags & IFF_UP)
2221		dev_activate(dev);
2222
2223	return 0;
2224}
2225
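/* Dump a single schedule entry as a nested TCA_TAPRIO_SCHED_ENTRY attribute
 * carrying its index, command, gate mask and interval, i.e. the same layout
 * accepted on the configuration side.
 */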
2226static int dump_entry(struct sk_buff *msg,
2227		      const struct sched_entry *entry)
2228{
2229	struct nlattr *item;
2230
2231	item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
2232	if (!item)
2233		return -ENOSPC;
2234
2235	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
2236		goto nla_put_failure;
2237
2238	if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
2239		goto nla_put_failure;
2240
2241	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
2242			entry->gate_mask))
2243		goto nla_put_failure;
2244
2245	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
2246			entry->interval))
2247		goto nla_put_failure;
2248
2249	return nla_nest_end(msg, item);
2250
2251nla_put_failure:
2252	nla_nest_cancel(msg, item);
2253	return -1;
2254}
2255
2256static int dump_schedule(struct sk_buff *msg,
2257			 const struct sched_gate_list *root)
2258{
2259	struct nlattr *entry_list;
2260	struct sched_entry *entry;
2261
2262	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
2263			root->base_time, TCA_TAPRIO_PAD))
2264		return -1;
2265
2266	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
2267			root->cycle_time, TCA_TAPRIO_PAD))
2268		return -1;
2269
2270	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
2271			root->cycle_time_extension, TCA_TAPRIO_PAD))
2272		return -1;
2273
2274	entry_list = nla_nest_start_noflag(msg,
2275					   TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
2276	if (!entry_list)
2277		goto error_nest;
2278
2279	list_for_each_entry(entry, &root->entries, list) {
2280		if (dump_entry(msg, entry) < 0)
2281			goto error_nest;
2282	}
2283
2284	nla_nest_end(msg, entry_list);
2285	return 0;
2286
2287error_nest:
2288	nla_nest_cancel(msg, entry_list);
2289	return -1;
2290}
2291
2292static int taprio_dump_tc_entries(struct sk_buff *skb,
2293				  struct taprio_sched *q,
2294				  struct sched_gate_list *sched)
2295{
2296	struct nlattr *n;
2297	int tc;
2298
2299	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
2300		n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
2301		if (!n)
2302			return -EMSGSIZE;
2303
2304		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
2305			goto nla_put_failure;
2306
2307		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
2308				sched->max_sdu[tc]))
2309			goto nla_put_failure;
2310
2311		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc]))
2312			goto nla_put_failure;
2313
2314		nla_nest_end(skb, n);
2315	}
2316
2317	return 0;
2318
2319nla_put_failure:
2320	nla_nest_cancel(skb, n);
2321	return -EMSGSIZE;
2322}
2323
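/* Offload stats that the driver did not fill in remain at all-ones (see the
 * memset(0xff) in taprio_dump_xstats()) and are simply skipped rather than
 * dumped as bogus values.
 */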
2324static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
2325{
2326	if (val == TAPRIO_STAT_NOT_SET)
2327		return 0;
2328	if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD))
2329		return -EMSGSIZE;
2330	return 0;
2331}
2332
2333static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d,
2334			      struct tc_taprio_qopt_offload *offload,
2335			      struct tc_taprio_qopt_stats *stats)
2336{
2337	struct net_device *dev = qdisc_dev(sch);
2338	const struct net_device_ops *ops;
2339	struct sk_buff *skb = d->skb;
2340	struct nlattr *xstats;
2341	int err;
2342
2343	ops = qdisc_dev(sch)->netdev_ops;
2344
2345	/* FIXME I could use qdisc_offload_dump_helper(), but that messes
2346	 * with sch->flags depending on whether the device reports taprio
2347	 * stats, and I'm not sure whether that's a good idea, considering
2348	 * that stats are optional to the offload itself
2349	 */
2350	if (!ops->ndo_setup_tc)
2351		return 0;
2352
2353	memset(stats, 0xff, sizeof(*stats));
2354
2355	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
2356	if (err == -EOPNOTSUPP)
2357		return 0;
2358	if (err)
2359		return err;
2360
2361	xstats = nla_nest_start(skb, TCA_STATS_APP);
2362	if (!xstats)
2363		goto err;
2364
2365	if (taprio_put_stat(skb, stats->window_drops,
2366			    TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) ||
2367	    taprio_put_stat(skb, stats->tx_overruns,
2368			    TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS))
2369		goto err_cancel;
2370
2371	nla_nest_end(skb, xstats);
2372
2373	return 0;
2374
2375err_cancel:
2376	nla_nest_cancel(skb, xstats);
2377err:
2378	return -EMSGSIZE;
2379}
2380
2381static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2382{
2383	struct tc_taprio_qopt_offload offload = {
2384		.cmd = TAPRIO_CMD_STATS,
2385	};
2386
2387	return taprio_dump_xstats(sch, d, &offload, &offload.stats);
2388}
2389
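/* Dump order: priomap, clockid (unless full offload is enabled), flags and
 * txtime-delay when set, then the per-TC entries and schedule of the
 * operational cycle, and finally the admin schedule nested under
 * TCA_TAPRIO_ATTR_ADMIN_SCHED if one is pending.
 */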
2390static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
2391{
2392	struct taprio_sched *q = qdisc_priv(sch);
2393	struct net_device *dev = qdisc_dev(sch);
2394	struct sched_gate_list *oper, *admin;
2395	struct tc_mqprio_qopt opt = { 0 };
2396	struct nlattr *nest, *sched_nest;
2397
2398	oper = rtnl_dereference(q->oper_sched);
2399	admin = rtnl_dereference(q->admin_sched);
2400
2401	mqprio_qopt_reconstruct(dev, &opt);
2402
2403	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
2404	if (!nest)
2405		goto start_error;
2406
2407	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
2408		goto options_error;
2409
2410	if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
2411	    nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
2412		goto options_error;
2413
2414	if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
2415		goto options_error;
2416
2417	if (q->txtime_delay &&
2418	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
2419		goto options_error;
2420
2421	if (oper && taprio_dump_tc_entries(skb, q, oper))
2422		goto options_error;
2423
2424	if (oper && dump_schedule(skb, oper))
2425		goto options_error;
2426
2427	if (!admin)
2428		goto done;
2429
2430	sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
2431	if (!sched_nest)
2432		goto options_error;
2433
2434	if (dump_schedule(skb, admin))
2435		goto admin_error;
2436
2437	nla_nest_end(skb, sched_nest);
2438
2439done:
2440	return nla_nest_end(skb, nest);
2441
2442admin_error:
2443	nla_nest_cancel(skb, sched_nest);
2444
2445options_error:
2446	nla_nest_cancel(skb, nest);
2447
2448start_error:
2449	return -ENOSPC;
2450}
2451
2452static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
2453{
2454	struct taprio_sched *q = qdisc_priv(sch);
2455	struct net_device *dev = qdisc_dev(sch);
2456	unsigned int ntx = cl - 1;
2457
2458	if (ntx >= dev->num_tx_queues)
2459		return NULL;
2460
2461	return q->qdiscs[ntx];
2462}
2463
2464static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
2465{
2466	unsigned int ntx = TC_H_MIN(classid);
2467
2468	if (!taprio_queue_get(sch, ntx))
2469		return 0;
2470	return ntx;
2471}
2472
2473static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
2474			     struct sk_buff *skb, struct tcmsg *tcm)
2475{
2476	struct Qdisc *child = taprio_leaf(sch, cl);
2477
2478	tcm->tcm_parent = TC_H_ROOT;
2479	tcm->tcm_handle |= TC_H_MIN(cl);
2480	tcm->tcm_info = child->handle;
2481
2482	return 0;
2483}
2484
2485static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2486				   struct gnet_dump *d)
2487	__releases(d->lock)
2488	__acquires(d->lock)
2489{
2490	struct Qdisc *child = taprio_leaf(sch, cl);
2491	struct tc_taprio_qopt_offload offload = {
2492		.cmd = TAPRIO_CMD_QUEUE_STATS,
2493		.queue_stats = {
2494			.queue = cl - 1,
2495		},
2496	};
2497
2498	if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 ||
2499	    qdisc_qstats_copy(d, child) < 0)
2500		return -1;
2501
2502	return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats);
2503}
2504
2505static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2506{
2507	struct net_device *dev = qdisc_dev(sch);
2508	unsigned long ntx;
2509
2510	if (arg->stop)
2511		return;
2512
2513	arg->count = arg->skip;
2514	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
2515		if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
2516			break;
2517	}
2518}
2519
2520static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
2521						struct tcmsg *tcm)
2522{
2523	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
2524}
2525
2526static const struct Qdisc_class_ops taprio_class_ops = {
2527	.graft		= taprio_graft,
2528	.leaf		= taprio_leaf,
2529	.find		= taprio_find,
2530	.walk		= taprio_walk,
2531	.dump		= taprio_dump_class,
2532	.dump_stats	= taprio_dump_class_stats,
2533	.select_queue	= taprio_select_queue,
2534};
2535
2536static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
2537	.cl_ops		= &taprio_class_ops,
2538	.id		= "taprio",
2539	.priv_size	= sizeof(struct taprio_sched),
2540	.init		= taprio_init,
2541	.change		= taprio_change,
2542	.destroy	= taprio_destroy,
2543	.reset		= taprio_reset,
2544	.attach		= taprio_attach,
2545	.peek		= taprio_peek,
2546	.dequeue	= taprio_dequeue,
2547	.enqueue	= taprio_enqueue,
2548	.dump		= taprio_dump,
2549	.dump_stats	= taprio_dump_stats,
2550	.owner		= THIS_MODULE,
2551};
2552
2553static struct notifier_block taprio_device_notifier = {
2554	.notifier_call = taprio_dev_notifier,
2555};
2556
2557static int __init taprio_module_init(void)
2558{
2559	int err = register_netdevice_notifier(&taprio_device_notifier);
2560
2561	if (err)
2562		return err;
2563
2564	return register_qdisc(&taprio_qdisc_ops);
2565}
2566
2567static void __exit taprio_module_exit(void)
2568{
2569	unregister_qdisc(&taprio_qdisc_ops);
2570	unregister_netdevice_notifier(&taprio_device_notifier);
2571}
2572
2573module_init(taprio_module_init);
2574module_exit(taprio_module_exit);
2575MODULE_LICENSE("GPL");
2576