xref: /kernel/linux/linux-5.10/net/sched/sch_cbq.c (revision 8c2ecf20)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	Anyone checking this code against the LBL version should take
	into account that ONLY the skeleton was borrowed; the
	implementation is different. In particular:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than the MTU, which is always the case when real-time classes
	have small rates. Note that the statement in [3] is
	incomplete: delay may actually be estimated even if the class's
	per-round allotment is less than the MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we get the IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places that look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to treat the next dequeue event
	as a sign that the previous packet has finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it holds.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
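/* Editor's note: a hedged worked instance of the delay bound above,
 * reading [x] as the ceiling (all numbers are illustrative assumptions,
 * not from the source). Take MTU = 1500 bytes, link bandwidth
 * B = 12.5e6 bytes/s (100 Mbit/s), k = 2 classes with r_1 = r_2 = 0.25
 * (so r = 0.5), and W = 6000 bytes. The per-round allotment is
 * W*r_1 = 1500 and [MTU/(W*r_1)] = 1, hence
 *
 *	delay_1 <= (1*6000*0.5 + 6000*0.5 + 2*1500)/12.5e6
 *		 = 9000/12.5e6 = 720 us,
 *
 * finite even though the allotment only just reaches the MTU.
 */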

struct cbq_sched_data;


struct cbq_class {
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_queue qstats;
	struct net_rate_estimator __rcu *rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto __rcu	*filter_list;
	struct tcf_block	*block;

	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
};

struct cbq_sched_data {
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO + 1];
	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];

	struct cbq_class	link;

	unsigned int		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	unsigned int		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}
	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
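/* Editor's note: a hedged walk-through of the steps below, using a
 * hypothetical hierarchy (handles and roles are illustrative
 * assumptions, not from the source):
 *
 *	1:0 root (link)
 *	 +-- 1:1 split node with filters attached
 *	      +-- 1:10 leaf (interactive), 1:20 leaf (bulk)
 *
 * A packet with skb->priority == 1:10 short-circuits at Step 1.
 * Otherwise the root's filter list runs; if it returns 1:1 (a
 * non-leaf), the loop repeats with 1:1's own filters ("Step 2+n")
 * until a level-0 class is reached, falling back to the split node's
 * defaults[] indexed by skb->priority & TC_PRIO_MAX when no filter
 * matches.
 */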

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_proto *fl;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		fl = rcu_dereference_bh(head->filter_list);
		/*
		 * Step 2+n. Apply classifier.
		 */
		result = tcf_classify(skb, fl, &res, true);
		if (!fl || result < 0)
			goto fallback;
		if (result == TC_ACT_SHOT)
			return NULL;

		cl = (void *)res.class;
		if (!cl) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL)
				goto fallback;
		}
		if (cl->level >= head->level)
			goto fallback;
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If the classifier selected a link-sharing class,
		 *	   apply the agency-specific classifier.
		 *	   Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
 * A packet has just been enqueued to an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */
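/* Editor's note (a hedged reading of the code below, not a comment
 * from the source): q->active[prio] points at the TAIL of a circular
 * singly-linked ring threaded through next_alive, so tail->next_alive
 * is the head. E.g. with ring A->B->C->A and tail C, activating D
 * yields A->B->C->D->A with tail D: D inherits C's next_alive (the
 * head A) and becomes the new tail.
 */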

static inline void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
 * Unlink class from active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level) {
		psched_time_t now = psched_get_time();

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	    struct sk_buff **to_free)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int ret;
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}

	ret = qdisc_enqueue(skb, cl->q, to_free);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		qdisc_qstats_drop(sch);
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}

/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * Class goes to sleep, so that it will have no
		 * chance to work avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug here: apparently they
		 * forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = 0;
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
	}

	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;
	psched_time_t now;

	q->tx_class = NULL;
	/* Time integrator. We calculate EOS time
	 * by adding the expected packet transmission time.
	 */
	now = q->now + L2T(&q->link, len);

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is the total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
		 */

		idle = now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
		 * cl->avgidle == true_avgidle/W,
		 * hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * It will occur when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * necessary to complete the transmitted packet.
			 * (len/phys_bandwidth has already elapsed by
			 * the moment cbq_update runs.)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		if ((s64)(now - cl->last) > 0)
			cl->last = now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
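/* Editor's note: a hedged worked example of the scaled EWMA above
 * (numbers are illustrative assumptions). With ewma_log = 5, i.e.
 * W = 1/32, the update is avgidle += idle - (avgidle >> 5). If every
 * packet observes idle = 200 ticks, avgidle converges to the fixed
 * point where idle == avgidle >> 5, i.e. avgidle = 32*200 = 6400,
 * which is exactly true_avgidle/W = 200/(1/32). A saturating sender
 * (idle <= 0) drags avgidle negative, which the overlimit branch then
 * converts into an undertime in the future.
 */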

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Currently, an
		 * overlimit action is generated for non-bounded classes
		 * only if the link is completely congested.
		 * Though it agrees with the ancestor-only paradigm,
		 * it looks very dubious. In particular, it means that
		 * this chunk of code will either never be called or
		 * will strongly amplify burstiness. Dangerous and
		 * silly, but no other solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			cbq_overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}

static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
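/* Editor's note: a hedged sketch of the WRR accounting above (numbers
 * are illustrative assumptions). A class may send whenever its deficit
 * is positive, and is charged the packet length; the quantum is only
 * added back when the class is visited with deficit <= 0. So two
 * siblings in the same band with quanta 1500 and 500 see roughly a 3:1
 * long-run byte split, and since deficits may go (and stay) negative
 * across rounds, quanta smaller than the MTU still work, as the header
 * comment claims.
 */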

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int activemask;

	activemask = q->activemask & 0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;

	now = psched_get_time();

	if (q->tx_class)
		cbq_update(q);

	q->now = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * This is possible if:
		 *
		 * 1. The scheduler is empty.
		 * 2. The toplevel cutoff inhibited borrowing.
		 * 3. The root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy: peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; that is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in the scheduler, or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
	 */

	if (sch->q.qlen) {
		qdisc_qstats_overlimit(sch);
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
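/* Editor's note: a hedged numeric check of the quantum formula above
 * (values are illustrative assumptions). With two classes at the same
 * prio, both with allot = 1514 and weights 2 and 1: quanta[prio] = 3
 * and nclasses[prio] = 2, so quantum_1 = 2*1514*2/3 ~= 2018 and
 * quantum_2 = 1*1514*2/3 ~= 1009, i.e. per-round service proportional
 * to weight while the total stays near nclasses * allot.
 */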

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct cbq_class *c;

			hlist_for_each_entry(c, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
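/* Editor's note (a hedged reading of the code above): defmap is a
 * bitmask indexed by the logical priorities 0..TC_PRIO_MAX. Setting,
 * say, cl->defmap = (1 << TC_PRIO_BESTEFFORT) | (1 << TC_PRIO_BULK)
 * offers cl as the default target for best-effort and bulk traffic at
 * its split node; cbq_sync_defmap then picks, per priority, the
 * claimant with the lowest level among all classes sharing that split.
 */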

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}

static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority - 1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
			 struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	int err;

	if (!opt) {
		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
		return -EINVAL;
	}

	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
					  cbq_policy, extack);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_WRROPT]) {
		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);

		if (wrr->priority > TC_CBQ_MAXPRIO) {
			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
			err = -EINVAL;
		}
	}
	return err;
}

static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	q->delay_timer.function = cbq_undelay;

	err = cbq_opt_parse(tb, opt, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
		return -EINVAL;
	}

	r = nla_data(tb[TCA_CBQ_RATE]);

	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
	if (!q->link.R_tab)
		return -EINVAL;

	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
	if (err)
		goto put_rtab;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_block;

	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle, NULL);
	if (!q->link.q)
		q->link.q = &noop_qdisc;
	else
		qdisc_hash_add(q->link.q, true);

	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_block:
	tcf_block_put(q->link.block);

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
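/* Editor's note: a hedged userspace sketch of the netlink attributes
 * this init path consumes (classic iproute2 invocation; exact option
 * spelling may vary by version, so treat it as illustrative):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq \
 *		bandwidth 100Mbit avpkt 1000 cell 8
 *
 * "bandwidth"/"cell" become TCA_CBQ_RATE plus the TCA_CBQ_RTAB rate
 * table, and "avpkt" arrives in TCA_CBQ_LSSOPT; both RATE and RTAB
 * are mandatory, as checked above.
 */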

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	memset(&opt, 0, sizeof(opt));
	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
	opt.weight = cl->weight;
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
	struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	__u32 qlen;

	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;
	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				  d, NULL, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
					cl->common.classid, extack);
		if (new == NULL)
			return -ENOBUFS;
	}

	*old = qdisc_replace(sch, new, &cl->q);
	return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cbq_deactivate_class(cl);
}

static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	return (unsigned long)cbq_class_lookup(q, classid);
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_block_put(cl->block);
	qdisc_put(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			tcf_block_put(cl->block);
			cl->block = NULL;
		}
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg, struct netlink_ext_ack *extack)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	err = cbq_opt_parse(tb, opt, extack);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
		return -EOPNOTSUPP;
	}

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid) {
				NL_SET_ERR_MSG(extack, "Invalid parent id");
				return -EINVAL;
			}
			if (!cl->tparent && parentid != TC_H_ROOT) {
				NL_SET_ERR_MSG(extack, "Parent must be root");
				return -EINVAL;
			}
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB], extack);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    qdisc_root_sleeping_running(sch),
						    tca[TCA_RATE]);
			if (err) {
				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
				qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

1564		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1565		return -EINVAL;
1566	}
1567
1568	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1569			      extack);
1570	if (rtab == NULL)
1571		return -EINVAL;
1572
1573	if (classid) {
1574		err = -EINVAL;
1575		if (TC_H_MAJ(classid ^ sch->handle) ||
1576		    cbq_class_lookup(q, classid)) {
1577			NL_SET_ERR_MSG(extack, "Specified class not found");
1578			goto failure;
1579		}
1580	} else {
1581		int i;
1582		classid = TC_H_MAKE(sch->handle, 0x8000);
1583
1584		for (i = 0; i < 0x8000; i++) {
1585			if (++q->hgenerator >= 0x8000)
1586				q->hgenerator = 1;
1587			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1588				break;
1589		}
1590		err = -ENOSR;
1591		if (i >= 0x8000) {
1592			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1593			goto failure;
1594		}
1595		classid = classid|q->hgenerator;
1596	}
1597
1598	parent = &q->link;
1599	if (parentid) {
1600		parent = cbq_class_lookup(q, parentid);
1601		err = -EINVAL;
1602		if (!parent) {
1603			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1604			goto failure;
1605		}
1606	}
1607
1608	err = -ENOBUFS;
1609	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1610	if (cl == NULL)
1611		goto failure;
1612
1613	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1614	if (err) {
1615		kfree(cl);
1616		goto failure;
1617	}
1618
1619	if (tca[TCA_RATE]) {
1620		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1621					NULL,
1622					qdisc_root_sleeping_running(sch),
1623					tca[TCA_RATE]);
1624		if (err) {
1625			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1626			tcf_block_put(cl->block);
1627			kfree(cl);
1628			goto failure;
1629		}
1630	}
1631
1632	cl->R_tab = rtab;
1633	rtab = NULL;
1634	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1635				  NULL);
1636	if (!cl->q)
1637		cl->q = &noop_qdisc;
1638	else
1639		qdisc_hash_add(cl->q, true);
1640
1641	cl->common.classid = classid;
1642	cl->tparent = parent;
1643	cl->qdisc = sch;
1644	cl->allot = parent->allot;
1645	cl->quantum = cl->allot;
1646	cl->weight = cl->R_tab->rate.rate;
1647
1648	sch_tree_lock(sch);
1649	cbq_link_class(cl);
1650	cl->borrow = cl->tparent;
1651	if (cl->tparent != &q->link)
1652		cl->share = cl->tparent;
1653	cbq_adjust_levels(parent);
1654	cl->minidle = -0x7FFFFFFF;
1655	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1656	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1657	if (cl->ewma_log == 0)
1658		cl->ewma_log = q->link.ewma_log;
1659	if (cl->maxidle == 0)
1660		cl->maxidle = q->link.maxidle;
1661	if (cl->avpkt == 0)
1662		cl->avpkt = q->link.avpkt;
1663	if (tb[TCA_CBQ_FOPT])
1664		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1665	sch_tree_unlock(sch);
1666
1667	qdisc_class_hash_grow(sch, &q->clhash);
1668
1669	*arg = (unsigned long)cl;
1670	return 0;
1671
1672failure:
1673	qdisc_put_rtab(rtab);
1674	return err;
1675}
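/* Editor's note: a hedged userspace sketch of creating a class via
 * this path (classic LARTC-style invocation; exact option spelling
 * may vary by iproute2 version, so treat it as illustrative):
 *
 *	tc class add dev eth0 parent 1: classid 1:1 cbq \
 *		bandwidth 100Mbit rate 6Mbit weight 0.6Mbit prio 5 \
 *		allot 1514 avpkt 1000 maxburst 20 bounded
 *
 * This supplies the three mandatory attribute groups checked above:
 * TCA_CBQ_RATE (plus the TCA_CBQ_RTAB table) from "rate",
 * TCA_CBQ_WRROPT from "weight"/"prio"/"allot", and TCA_CBQ_LSSOPT
 * from "avpkt"/"maxburst"/"bounded".
 */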

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qdisc_purge_queue(cl->q);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	cbq_destroy_class(sch, cl);
	return 0;
}

static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
				       struct netlink_ext_ack *extack)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return cl->block;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.find		=	cbq_find,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_block	=	cbq_tcf_block,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");
