// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
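
/* For illustration only (not part of the original file): a minimal,
 * hypothetical qdisc built from the pieces described above. The
 * "plainfifo" names are made up; a real qdisc would also handle
 * limits, drops and locking.
 *
 *	static int plainfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				     struct sk_buff **to_free)
 *	{
 *		__skb_queue_tail(&sch->q, skb);
 *		qdisc_qstats_backlog_inc(sch, skb);
 *		return NET_XMIT_SUCCESS;	// i.e. 0: enqueued successfully
 *	}
 *
 *	static struct sk_buff *plainfifo_dequeue(struct Qdisc *sch)
 *	{
 *		struct sk_buff *skb = __skb_dequeue(&sch->q);
 *
 *		if (skb)
 *			qdisc_qstats_backlog_dec(sch, skb);
 *		return skb;	// NULL from this trivial queue really means empty
 *	}
 *
 *	static struct Qdisc_ops plainfifo_qdisc_ops __read_mostly = {
 *		.id		= "plainfifo",
 *		.enqueue	= plainfifo_enqueue,
 *		.dequeue	= plainfifo_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * A module would call register_qdisc(&plainfifo_qdisc_ops) from its
 * init function and unregister_qdisc() from its exit function.
 */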

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get the default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set the new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop the lock and try to load the module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set the default value from the kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (the root qdisc, all its children, children of children, etc.).
 * Note: the caller either holds the RTNL semaphore or rcu_read_lock().
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To keep backward compatibility with older iproute2 tc
 * utils, we detect the linklayer setting by checking whether the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding the mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, taking the table entry one below
 * it, and comparing the two.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
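
/* Worked example for the detection above (illustrative numbers, not
 * from the original file): with mpu = 0 and cell_log = 3, low = 0,
 * high = roundup(1, 48) = 48, cell_low = 0 and cell_high = 5.
 * rtab[0] covers packet sizes up to 8 bytes and rtab[5] sizes up to
 * 48 bytes; on an ATM-aligned table both cost exactly one 48-byte
 * cell, so the two entries match and TC_LINKLAYER_ATM is returned,
 * while an Ethernet table grows per byte and the entries differ.
 */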

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
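
/* Worked example for the size-table lookup above (illustrative
 * numbers, not from the original file): with overhead = 24,
 * cell_align = -1, cell_log = 6, size_log = 6 and tsize = 512, an
 * skb of len 1000 gives pkt_len = 1024 and slot = 1023 >> 6 = 15;
 * with an identity table (data[i] == i + 1) the result is
 * data[15] << 6 = 16 * 64 = 1024, i.e. the length rounded up to a
 * whole number of 64-byte cells.
 */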

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If the timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel.
 * The possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify the parent qdisc only if the child qdisc becomes
		 * empty.
		 *
		 * If the child was empty even before the update, then the
		 * backlog counter is inconsistent and we skip the
		 * notification because the parent class is already passive.
		 *
		 * If the original child was offloaded, then it is allowed
		 * to appear empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report an error if the graft is part of a destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report an error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			dev_queue = dev_ingress_queue(dev);
			old = dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new);

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running the class lockless if the parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action, so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignore errors from replace_estimator
		 * because the change can't be undone.
		 */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

static bool req_create_or_replace(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_REPLACE);
}

static bool req_create_exclusive(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_EXCL);
}

static bool req_change(struct nlmsghdr *n)
{
	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
		!(n->nlmsg_flags & NLM_F_REPLACE) &&
		!(n->nlmsg_flags & NLM_F_EXCL));
}

/*
 * Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a choice:
				 *   1) change it or 2) create/graft a new one.
				 *   If the requested qdisc kind is different
				 *   from the existing one, then we choose graft.
				 *   If they are the same, then this is a "change"
				 *   operation - just let it fall through.
				 *
				 *   1. We are allowed to create/graft only
				 *   if the request explicitly states
				 *   "please create if it doesn't exist".
				 *
				 *   2. If the request is to create exclusively,
				 *   then the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   This happens when, for example, the tc
				 *   utility issues a "change" command.
				 *   Alas, this is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft.
				 */
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
create_n_graft2:
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the
	 * global qdisc hashtable, we don't want to hit it again.
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1843
1844
1845
1846/************************************************
1847 *	Traffic classes manipulation.		*
1848 ************************************************/
1849
1850static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1851			  unsigned long cl,
1852			  u32 portid, u32 seq, u16 flags, int event)
1853{
1854	struct tcmsg *tcm;
1855	struct nlmsghdr  *nlh;
1856	unsigned char *b = skb_tail_pointer(skb);
1857	struct gnet_dump d;
1858	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1859
1860	cond_resched();
1861	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1862	if (!nlh)
1863		goto out_nlmsg_trim;
1864	tcm = nlmsg_data(nlh);
1865	tcm->tcm_family = AF_UNSPEC;
1866	tcm->tcm__pad1 = 0;
1867	tcm->tcm__pad2 = 0;
1868	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1869	tcm->tcm_parent = q->handle;
1870	tcm->tcm_handle = q->handle;
1871	tcm->tcm_info = 0;
1872	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1873		goto nla_put_failure;
1874	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1875		goto nla_put_failure;
1876
1877	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1878					 NULL, &d, TCA_PAD) < 0)
1879		goto nla_put_failure;
1880
1881	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1882		goto nla_put_failure;
1883
1884	if (gnet_stats_finish_copy(&d) < 0)
1885		goto nla_put_failure;
1886
1887	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1888	return skb->len;
1889
1890out_nlmsg_trim:
1891nla_put_failure:
1892	nlmsg_trim(skb, b);
1893	return -1;
1894}
1895
1896static int tclass_notify(struct net *net, struct sk_buff *oskb,
1897			 struct nlmsghdr *n, struct Qdisc *q,
1898			 unsigned long cl, int event)
1899{
1900	struct sk_buff *skb;
1901	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1902	int err = 0;
1903
1904	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1905	if (!skb)
1906		return -ENOBUFS;
1907
1908	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1909		kfree_skb(skb);
1910		return -EINVAL;
1911	}
1912
1913	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1914			     n->nlmsg_flags & NLM_F_ECHO);
1915	if (err > 0)
1916		err = 0;
1917	return err;
1918}
1919
1920static int tclass_del_notify(struct net *net,
1921			     const struct Qdisc_class_ops *cops,
1922			     struct sk_buff *oskb, struct nlmsghdr *n,
1923			     struct Qdisc *q, unsigned long cl)
1924{
1925	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1926	struct sk_buff *skb;
1927	int err = 0;
1928
1929	if (!cops->delete)
1930		return -EOPNOTSUPP;
1931
1932	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1933	if (!skb)
1934		return -ENOBUFS;
1935
1936	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1937			   RTM_DELTCLASS) < 0) {
1938		kfree_skb(skb);
1939		return -EINVAL;
1940	}
1941
1942	err = cops->delete(q, cl);
1943	if (err) {
1944		kfree_skb(skb);
1945		return err;
1946	}
1947
1948	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1949			     n->nlmsg_flags & NLM_F_ECHO);
1950	if (err > 0)
1951		err = 0;
1952	return err;
1953}
1954
1955#ifdef CONFIG_NET_CLS
1956
1957struct tcf_bind_args {
1958	struct tcf_walker w;
1959	unsigned long base;
1960	unsigned long cl;
1961	u32 classid;
1962};
1963
1964static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1965{
1966	struct tcf_bind_args *a = (void *)arg;
1967
1968	if (tp->ops->bind_class) {
1969		struct Qdisc *q = tcf_block_q(tp->chain->block);
1970
1971		sch_tree_lock(q);
1972		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1973		sch_tree_unlock(q);
1974	}
1975	return 0;
1976}
1977
1978struct tc_bind_class_args {
1979	struct qdisc_walker w;
1980	unsigned long new_cl;
1981	u32 portid;
1982	u32 clid;
1983};
1984
1985static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1986				struct qdisc_walker *w)
1987{
1988	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1989	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1990	struct tcf_block *block;
1991	struct tcf_chain *chain;
1992
1993	block = cops->tcf_block(q, cl, NULL);
1994	if (!block)
1995		return 0;
1996	for (chain = tcf_get_next_chain(block, NULL);
1997	     chain;
1998	     chain = tcf_get_next_chain(block, chain)) {
1999		struct tcf_proto *tp;
2000
2001		for (tp = tcf_get_next_proto(chain, NULL, true);
2002		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
2003			struct tcf_bind_args arg = {};
2004
2005			arg.w.fn = tcf_node_bind;
2006			arg.classid = a->clid;
2007			arg.base = cl;
2008			arg.cl = a->new_cl;
2009			tp->ops->walk(tp, &arg.w, true);
2010		}
2011	}
2012
2013	return 0;
2014}
2015
2016static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2017			   unsigned long new_cl)
2018{
2019	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2020	struct tc_bind_class_args args = {};
2021
2022	if (!cops->tcf_block)
2023		return;
2024	args.portid = portid;
2025	args.clid = clid;
2026	args.new_cl = new_cl;
2027	args.w.fn = tc_bind_class_walker;
2028	cops->walk(q, &args.w);
2029}
2030
2031#else
2032
2033static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2034			   unsigned long new_cl)
2035{
2036}
2037
2038#endif
2039
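/* tc_ctl_tclass() is the doit handler registered in pktsched_init()
 * below for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS, i.e. the
 * requests behind "tc class add/change/replace", "tc class del" and
 * single-class "tc class" queries.  (Annotation, not original source.)
 */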
2040static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2041			 struct netlink_ext_ack *extack)
2042{
2043	struct net *net = sock_net(skb->sk);
2044	struct tcmsg *tcm = nlmsg_data(n);
2045	struct nlattr *tca[TCA_MAX + 1];
2046	struct net_device *dev;
2047	struct Qdisc *q = NULL;
2048	const struct Qdisc_class_ops *cops;
2049	unsigned long cl = 0;
2050	unsigned long new_cl;
2051	u32 portid;
2052	u32 clid;
2053	u32 qid;
2054	int err;
2055
2056	if ((n->nlmsg_type != RTM_GETTCLASS) &&
2057	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2058		return -EPERM;
2059
2060	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2061				     rtm_tca_policy, extack);
2062	if (err < 0)
2063		return err;
2064
2065	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2066	if (!dev)
2067		return -ENODEV;
2068
2069	/*
2070	   parent == TC_H_UNSPEC - unspecified parent.
2071	   parent == TC_H_ROOT   - class is root, which has no parent.
2072	   parent == X:0	 - parent is root class.
2073	   parent == X:Y	 - parent is a node in hierarchy.
2074	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
2075
2076	   handle == 0:0	 - generate handle from kernel pool.
2077	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
2078	   handle == X:Y	 - class is X:Y, fully specified.
2079	   handle == X:0	 - root class.
2080	 */
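	/* Illustrative example (annotation, not original source): for
	 * "tc class add dev eth0 parent 1: classid 1:10" userspace sends
	 * tcm_parent = 0x00010000 (1:0) and tcm_handle = 0x00010010
	 * (1:10; tc handles are hexadecimal).  Below, qid becomes
	 * TC_H_MAJ(0x00010010) == 0x00010000, matching the parent's
	 * major, so both refer to qdisc 1:0.
	 */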
2081
2082	/* Step 1. Determine qdisc handle X:0 */
2083
2084	portid = tcm->tcm_parent;
2085	clid = tcm->tcm_handle;
2086	qid = TC_H_MAJ(clid);
2087
2088	if (portid != TC_H_ROOT) {
2089		u32 qid1 = TC_H_MAJ(portid);
2090
2091		if (qid && qid1) {
2092			/* If both majors are known, they must be identical. */
2093			if (qid != qid1)
2094				return -EINVAL;
2095		} else if (qid1) {
2096			qid = qid1;
2097		} else if (qid == 0)
2098			qid = rtnl_dereference(dev->qdisc)->handle;
2099
2100		/* Now qid is a genuine qdisc handle consistent with
2101		 * both parent and child.
2102		 *
2103		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2104		 */
2105		if (portid)
2106			portid = TC_H_MAKE(qid, portid);
2107	} else {
2108		if (qid == 0)
2109			qid = rtnl_dereference(dev->qdisc)->handle;
2110	}
2111
2112	/* OK. Locate qdisc */
2113	q = qdisc_lookup(dev, qid);
2114	if (!q)
2115		return -ENOENT;
2116
2117	/* And check that it supports classes */
2118	cops = q->ops->cl_ops;
2119	if (!cops)
2120		return -EINVAL;
2121
2122	/* Now try to get class */
2123	if (clid == 0) {
2124		if (portid == TC_H_ROOT)
2125			clid = qid;
2126	} else
2127		clid = TC_H_MAKE(qid, clid);
2128
2129	if (clid)
2130		cl = cops->find(q, clid);
2131
2132	if (cl == 0) {
2133		err = -ENOENT;
2134		if (n->nlmsg_type != RTM_NEWTCLASS ||
2135		    !(n->nlmsg_flags & NLM_F_CREATE))
2136			goto out;
2137	} else {
2138		switch (n->nlmsg_type) {
2139		case RTM_NEWTCLASS:
2140			err = -EEXIST;
2141			if (n->nlmsg_flags & NLM_F_EXCL)
2142				goto out;
2143			break;
2144		case RTM_DELTCLASS:
2145			err = tclass_del_notify(net, cops, skb, n, q, cl);
2146			/* Unbind the class from its filters by rebinding them to 0 */
2147			tc_bind_tclass(q, portid, clid, 0);
2148			goto out;
2149		case RTM_GETTCLASS:
2150			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2151			goto out;
2152		default:
2153			err = -EINVAL;
2154			goto out;
2155		}
2156	}
2157
2158	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2159		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2160		return -EOPNOTSUPP;
2161	}
2162
2163	new_cl = cl;
2164	err = -EOPNOTSUPP;
2165	if (cops->change)
2166		err = cops->change(q, clid, portid, tca, &new_cl, extack);
2167	if (err == 0) {
2168		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2169		/* We just created a new class; do the reverse binding. */
2170		if (cl != new_cl)
2171			tc_bind_tclass(q, portid, clid, new_cl);
2172	}
2173out:
2174	return err;
2175}
2176
2177struct qdisc_dump_args {
2178	struct qdisc_walker	w;
2179	struct sk_buff		*skb;
2180	struct netlink_callback	*cb;
2181};
2182
2183static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2184			    struct qdisc_walker *arg)
2185{
2186	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2187
2188	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2189			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2190			      RTM_NEWTCLASS);
2191}
2192
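/* Class dumps are resumable: cb->args[0] counts qdiscs already dumped
 * in full and cb->args[1] counts classes emitted for the current qdisc,
 * so a multi-part netlink dump continues where the last skb filled up.
 * (Annotation, not original source.)
 */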
2193static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2194				struct tcmsg *tcm, struct netlink_callback *cb,
2195				int *t_p, int s_t)
2196{
2197	struct qdisc_dump_args arg;
2198
2199	if (tc_qdisc_dump_ignore(q, false) ||
2200	    *t_p < s_t || !q->ops->cl_ops ||
2201	    (tcm->tcm_parent &&
2202	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2203		(*t_p)++;
2204		return 0;
2205	}
2206	if (*t_p > s_t)
2207		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
2208	arg.w.fn = qdisc_class_dump;
2209	arg.skb = skb;
2210	arg.cb = cb;
2211	arg.w.stop = 0;
2212	arg.w.skip = cb->args[1];
2213	arg.w.count = 0;
2214	q->ops->cl_ops->walk(q, &arg.w);
2215	cb->args[1] = arg.w.count;
2216	if (arg.w.stop)
2217		return -1;
2218	(*t_p)++;
2219	return 0;
2220}
2221
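/* tc_dump_tclass_root() dumps the classes of @root itself and, when
 * @recur is true, of every other qdisc hashed on the device.  The
 * caller passes recur == false for the ingress root so the per-device
 * hash is walked only once per dump.  (Annotation, not original source.)
 */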
2222static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2223			       struct tcmsg *tcm, struct netlink_callback *cb,
2224			       int *t_p, int s_t, bool recur)
2225{
2226	struct Qdisc *q;
2227	int b;
2228
2229	if (!root)
2230		return 0;
2231
2232	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2233		return -1;
2234
2235	if (!qdisc_dev(root) || !recur)
2236		return 0;
2237
2238	if (tcm->tcm_parent) {
2239		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2240		if (q && q != root &&
2241		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2242			return -1;
2243		return 0;
2244	}
2245	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2246		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2247			return -1;
2248	}
2249
2250	return 0;
2251}
2252
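/* tc_dump_tclass() is the dumpit handler for RTM_GETTCLASS ("tc class
 * show"): it dumps the classes under the device's root qdisc and,
 * separately, under its ingress qdisc, resuming from cb->args[0] on
 * multi-part dumps.  (Annotation, not original source.)
 */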
2253static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2254{
2255	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2256	struct net *net = sock_net(skb->sk);
2257	struct netdev_queue *dev_queue;
2258	struct net_device *dev;
2259	int t, s_t;
2260
2261	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2262		return 0;
2263	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2264	if (!dev)
2265		return 0;
2266
2267	s_t = cb->args[0];
2268	t = 0;
2269
2270	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2271				skb, tcm, cb, &t, s_t, true) < 0)
2272		goto done;
2273
2274	dev_queue = dev_ingress_queue(dev);
2275	if (dev_queue &&
2276	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2277				&t, s_t, false) < 0)
2278		goto done;
2279
2280done:
2281	cb->args[0] = t;
2282
2283	dev_put(dev);
2284	return skb->len;
2285}
2286
2287#ifdef CONFIG_PROC_FS
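/* /proc/net/psched prints four hex words: nanoseconds per microsecond,
 * nanoseconds per psched tick, a constant 1000000 (apparently kept for
 * compatibility with older ABI consumers), and the hrtimer resolution
 * in ticks per second.  Userspace tc reads this file to calibrate its
 * tick/time-unit conversions.  (Annotation, not original source.)
 */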
2288static int psched_show(struct seq_file *seq, void *v)
2289{
2290	seq_printf(seq, "%08x %08x %08x %08x\n",
2291		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2292		   1000000,
2293		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2294
2295	return 0;
2296}
2297
2298static int __net_init psched_net_init(struct net *net)
2299{
2300	struct proc_dir_entry *e;
2301
2302	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2303	if (!e)
2304		return -ENOMEM;
2305
2306	return 0;
2307}
2308
2309static void __net_exit psched_net_exit(struct net *net)
2310{
2311	remove_proc_entry("psched", net->proc_net);
2312}
2313#else
2314static int __net_init psched_net_init(struct net *net)
2315{
2316	return 0;
2317}
2318
2319static void __net_exit psched_net_exit(struct net *net)
2320{
2321}
2322#endif
2323
2324static struct pernet_operations psched_net_ops = {
2325	.init = psched_net_init,
2326	.exit = psched_net_exit,
2327};
2328
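/* pktsched_init() wires the subsystem up at boot: it registers the
 * per-netns /proc/net/psched entry, the built-in qdiscs that must be
 * available without module loading, and the rtnetlink doit/dumpit
 * handlers that route RTM_*QDISC and RTM_*TCLASS messages to the
 * functions above.  (Annotation, not original source.)
 */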
2329static int __init pktsched_init(void)
2330{
2331	int err;
2332
2333	err = register_pernet_subsys(&psched_net_ops);
2334	if (err) {
2335		pr_err("pktsched_init: cannot initialize per netns operations\n");
2337		return err;
2338	}
2339
2340	register_qdisc(&pfifo_fast_ops);
2341	register_qdisc(&pfifo_qdisc_ops);
2342	register_qdisc(&bfifo_qdisc_ops);
2343	register_qdisc(&pfifo_head_drop_qdisc_ops);
2344	register_qdisc(&mq_qdisc_ops);
2345	register_qdisc(&noqueue_qdisc_ops);
2346
2347	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2348	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2349	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2350		      0);
2351	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2352	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2353	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2354		      0);
2355
2356	return 0;
2357}
2358
2359subsys_initcall(pktsched_init);
2360