xref: /kernel/linux/linux-5.10/net/sched/sch_choke.c (revision 8c2ecf20)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   while keeping no per-flow state. The difference from RED is one extra
   step during enqueue: if the average queue size is over the low threshold
   (qmin), a packet is chosen at random from the queue, and if the new and
   the chosen packet belong to the same flow, both are dropped. Unlike RED,
   CHOKe is not really a "classful" qdisc, because it needs random access
   to the packets in its queue; flows are told apart with the builtin flow
   dissector (the minimal class interface that earlier versions used to let
   filters override the builtin classifier is gone in this version).

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe: A Stateless Active Queue
   Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, and S. Low, "Understanding CHOKe: Throughput and
   Spatial Characteristics", IEEE/ACM Transactions on Networking, 2004.
 */
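
/*
 * A sketch of the per-packet decision implemented below (editorial
 * summary, not compiled logic; the authoritative version is
 * choke_enqueue()):
 *
 *	qavg = EWMA of the queue length (see net/red.h)
 *	if (qavg > qth_min) {
 *		victim = random packet already in the queue;
 *		if (victim and the arriving packet are the same flow)
 *			drop both;
 *		else
 *			apply the usual RED mark/drop rules;
 *	}
 */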

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;		/* Drops due to queue limits */
		u32	other;		/* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
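
/*
 * Example of the wrap-around arithmetic above (illustrative numbers):
 * with a 128-slot table (tab_mask = 127), head = 100 and tail = 3, the
 * ring has wrapped and (3 - 100) & 127 = 31 slots are in use, holes
 * included.
 */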

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
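
/*
 * Worked example (hypothetical slot contents): with head = 4, tail = 9
 * and tab[5] == tab[6] == NULL (holes left by choke_drop_by_idx()
 * below), dequeuing the skb in slot 4 runs choke_zap_head_holes(),
 * which advances head straight to 7, the next occupied slot.
 */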

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
	--sch->q.qlen;
}

struct choke_skb_cb {
	u8			keys_valid;
	struct flow_keys_digest	keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Compare the flows of two packets.
 * Returns true only if source and destination address and port match;
 * false otherwise (including when the protocols differ).
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}
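
/*
 * The dissected keys are cached in skb->cb (choke_skb_cb) behind the
 * keys_valid flag, so each packet is dissected at most once no matter
 * how many comparisons it takes part in; two packets are "the same
 * flow" when their flow_keys_digest structs are byte-identical.
 */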

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb; if that fails, give up and
 *   return the head.
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
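
/*
 * With three probes, the chance that every one lands on a hole is h^3,
 * where h is the fraction of holed slots; e.g. with a quarter of the
 * slots holed, the head fallback is taken only (1/4)^3, i.e. about
 * 1.6%, of the time, so the victim choice stays close to uniform.
 */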

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;
}
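
/*
 * Outcome summary for the function above (plain RED plus the CHOKe
 * match step):
 *
 *	qavg <= qth_min            admit, reset RED counter
 *	random victim same flow    drop both, return NET_XMIT_CN
 *	qavg > qth_max             drop, or ECN-mark if allowed
 *	qth_min < qavg <= qth_max  probabilistic RED drop/mark
 *	queue at its limit         tail-drop (stats.pdrop)
 */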

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	if (q->tab)
		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;
	u8 *stab;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
					  choke_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
	stab = nla_data(tb[TCA_CHOKE_STAB]);
	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
		return -EINVAL;

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned int dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      stab,
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}
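
/*
 * Example of the table sizing in choke_change() (illustrative numbers):
 * a configured limit of 1000 packets gives
 * mask = roundup_pow_of_two(1001) - 1 = 1023, i.e. a 1024-slot table.
 * Sizing from limit + 1 keeps at least one slot spare, so
 * q->head == q->tail unambiguously means "queue empty".
 */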

static int choke_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	return choke_change(sch, opt, extack);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};
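
/*
 * Typical configuration from user space (a sketch; the parameter values
 * are illustrative, and iproute2's tc derives the RED thresholds and
 * Wlog/Plog/stab from them just as it does for RED):
 *
 *	tc qdisc add dev eth0 root choke limit 1000 \
 *		min 100 max 300 avpkt 1000 burst 150 \
 *		bandwidth 10mbit ecn
 */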

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");