18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * Author: Vijay Subramanian <vijaynsu@cisco.com>
58c2ecf20Sopenharmony_ci * Author: Mythili Prabhu <mysuryan@cisco.com>
68c2ecf20Sopenharmony_ci *
78c2ecf20Sopenharmony_ci * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
88c2ecf20Sopenharmony_ci * University of Oslo, Norway.
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci * References:
118c2ecf20Sopenharmony_ci * RFC 8033: https://tools.ietf.org/html/rfc8033
128c2ecf20Sopenharmony_ci */
138c2ecf20Sopenharmony_ci
148c2ecf20Sopenharmony_ci#include <linux/module.h>
158c2ecf20Sopenharmony_ci#include <linux/slab.h>
168c2ecf20Sopenharmony_ci#include <linux/types.h>
178c2ecf20Sopenharmony_ci#include <linux/kernel.h>
188c2ecf20Sopenharmony_ci#include <linux/errno.h>
198c2ecf20Sopenharmony_ci#include <linux/skbuff.h>
208c2ecf20Sopenharmony_ci#include <net/pkt_sched.h>
218c2ecf20Sopenharmony_ci#include <net/inet_ecn.h>
228c2ecf20Sopenharmony_ci#include <net/pie.h>
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci/* private data for the Qdisc */
258c2ecf20Sopenharmony_cistruct pie_sched_data {
268c2ecf20Sopenharmony_ci	struct pie_vars vars;
278c2ecf20Sopenharmony_ci	struct pie_params params;
288c2ecf20Sopenharmony_ci	struct pie_stats stats;
298c2ecf20Sopenharmony_ci	struct timer_list adapt_timer;
308c2ecf20Sopenharmony_ci	struct Qdisc *sch;
318c2ecf20Sopenharmony_ci};
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_cibool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
348c2ecf20Sopenharmony_ci		    struct pie_vars *vars, u32 backlog, u32 packet_size)
358c2ecf20Sopenharmony_ci{
368c2ecf20Sopenharmony_ci	u64 rnd;
378c2ecf20Sopenharmony_ci	u64 local_prob = vars->prob;
388c2ecf20Sopenharmony_ci	u32 mtu = psched_mtu(qdisc_dev(sch));
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_ci	/* If there is still burst allowance left skip random early drop */
418c2ecf20Sopenharmony_ci	if (vars->burst_time > 0)
428c2ecf20Sopenharmony_ci		return false;
438c2ecf20Sopenharmony_ci
448c2ecf20Sopenharmony_ci	/* If current delay is less than half of target, and
458c2ecf20Sopenharmony_ci	 * if drop prob is low already, disable early_drop
468c2ecf20Sopenharmony_ci	 */
478c2ecf20Sopenharmony_ci	if ((vars->qdelay < params->target / 2) &&
488c2ecf20Sopenharmony_ci	    (vars->prob < MAX_PROB / 5))
498c2ecf20Sopenharmony_ci		return false;
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_ci	/* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
528c2ecf20Sopenharmony_ci	 * similar to min_th in RED
538c2ecf20Sopenharmony_ci	 */
548c2ecf20Sopenharmony_ci	if (backlog < 2 * mtu)
558c2ecf20Sopenharmony_ci		return false;
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci	/* If bytemode is turned on, use packet size to compute new
588c2ecf20Sopenharmony_ci	 * probablity. Smaller packets will have lower drop prob in this case
598c2ecf20Sopenharmony_ci	 */
608c2ecf20Sopenharmony_ci	if (params->bytemode && packet_size <= mtu)
618c2ecf20Sopenharmony_ci		local_prob = (u64)packet_size * div_u64(local_prob, mtu);
628c2ecf20Sopenharmony_ci	else
638c2ecf20Sopenharmony_ci		local_prob = vars->prob;
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_ci	if (local_prob == 0)
668c2ecf20Sopenharmony_ci		vars->accu_prob = 0;
678c2ecf20Sopenharmony_ci	else
688c2ecf20Sopenharmony_ci		vars->accu_prob += local_prob;
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci	if (vars->accu_prob < (MAX_PROB / 100) * 85)
718c2ecf20Sopenharmony_ci		return false;
728c2ecf20Sopenharmony_ci	if (vars->accu_prob >= (MAX_PROB / 2) * 17)
738c2ecf20Sopenharmony_ci		return true;
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_ci	prandom_bytes(&rnd, 8);
768c2ecf20Sopenharmony_ci	if ((rnd >> BITS_PER_BYTE) < local_prob) {
778c2ecf20Sopenharmony_ci		vars->accu_prob = 0;
788c2ecf20Sopenharmony_ci		return true;
798c2ecf20Sopenharmony_ci	}
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci	return false;
828c2ecf20Sopenharmony_ci}
838c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(pie_drop_early);
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_cistatic int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
868c2ecf20Sopenharmony_ci			     struct sk_buff **to_free)
878c2ecf20Sopenharmony_ci{
888c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
898c2ecf20Sopenharmony_ci	bool enqueue = false;
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
928c2ecf20Sopenharmony_ci		q->stats.overlimit++;
938c2ecf20Sopenharmony_ci		goto out;
948c2ecf20Sopenharmony_ci	}
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci	if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
978c2ecf20Sopenharmony_ci			    skb->len)) {
988c2ecf20Sopenharmony_ci		enqueue = true;
998c2ecf20Sopenharmony_ci	} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
1008c2ecf20Sopenharmony_ci		   INET_ECN_set_ce(skb)) {
1018c2ecf20Sopenharmony_ci		/* If packet is ecn capable, mark it if drop probability
1028c2ecf20Sopenharmony_ci		 * is lower than 10%, else drop it.
1038c2ecf20Sopenharmony_ci		 */
1048c2ecf20Sopenharmony_ci		q->stats.ecn_mark++;
1058c2ecf20Sopenharmony_ci		enqueue = true;
1068c2ecf20Sopenharmony_ci	}
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci	/* we can enqueue the packet */
1098c2ecf20Sopenharmony_ci	if (enqueue) {
1108c2ecf20Sopenharmony_ci		/* Set enqueue time only when dq_rate_estimator is disabled. */
1118c2ecf20Sopenharmony_ci		if (!q->params.dq_rate_estimator)
1128c2ecf20Sopenharmony_ci			pie_set_enqueue_time(skb);
1138c2ecf20Sopenharmony_ci
1148c2ecf20Sopenharmony_ci		q->stats.packets_in++;
1158c2ecf20Sopenharmony_ci		if (qdisc_qlen(sch) > q->stats.maxq)
1168c2ecf20Sopenharmony_ci			q->stats.maxq = qdisc_qlen(sch);
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_ci		return qdisc_enqueue_tail(skb, sch);
1198c2ecf20Sopenharmony_ci	}
1208c2ecf20Sopenharmony_ci
1218c2ecf20Sopenharmony_ciout:
1228c2ecf20Sopenharmony_ci	q->stats.dropped++;
1238c2ecf20Sopenharmony_ci	q->vars.accu_prob = 0;
1248c2ecf20Sopenharmony_ci	return qdisc_drop(skb, sch, to_free);
1258c2ecf20Sopenharmony_ci}
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_cistatic const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
1288c2ecf20Sopenharmony_ci	[TCA_PIE_TARGET]		= {.type = NLA_U32},
1298c2ecf20Sopenharmony_ci	[TCA_PIE_LIMIT]			= {.type = NLA_U32},
1308c2ecf20Sopenharmony_ci	[TCA_PIE_TUPDATE]		= {.type = NLA_U32},
1318c2ecf20Sopenharmony_ci	[TCA_PIE_ALPHA]			= {.type = NLA_U32},
1328c2ecf20Sopenharmony_ci	[TCA_PIE_BETA]			= {.type = NLA_U32},
1338c2ecf20Sopenharmony_ci	[TCA_PIE_ECN]			= {.type = NLA_U32},
1348c2ecf20Sopenharmony_ci	[TCA_PIE_BYTEMODE]		= {.type = NLA_U32},
1358c2ecf20Sopenharmony_ci	[TCA_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
1368c2ecf20Sopenharmony_ci};
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_cistatic int pie_change(struct Qdisc *sch, struct nlattr *opt,
1398c2ecf20Sopenharmony_ci		      struct netlink_ext_ack *extack)
1408c2ecf20Sopenharmony_ci{
1418c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
1428c2ecf20Sopenharmony_ci	struct nlattr *tb[TCA_PIE_MAX + 1];
1438c2ecf20Sopenharmony_ci	unsigned int qlen, dropped = 0;
1448c2ecf20Sopenharmony_ci	int err;
1458c2ecf20Sopenharmony_ci
1468c2ecf20Sopenharmony_ci	if (!opt)
1478c2ecf20Sopenharmony_ci		return -EINVAL;
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci	err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy,
1508c2ecf20Sopenharmony_ci					  NULL);
1518c2ecf20Sopenharmony_ci	if (err < 0)
1528c2ecf20Sopenharmony_ci		return err;
1538c2ecf20Sopenharmony_ci
1548c2ecf20Sopenharmony_ci	sch_tree_lock(sch);
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci	/* convert from microseconds to pschedtime */
1578c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_TARGET]) {
1588c2ecf20Sopenharmony_ci		/* target is in us */
1598c2ecf20Sopenharmony_ci		u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_ci		/* convert to pschedtime */
1628c2ecf20Sopenharmony_ci		q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
1638c2ecf20Sopenharmony_ci	}
1648c2ecf20Sopenharmony_ci
1658c2ecf20Sopenharmony_ci	/* tupdate is in jiffies */
1668c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_TUPDATE])
1678c2ecf20Sopenharmony_ci		q->params.tupdate =
1688c2ecf20Sopenharmony_ci			usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_LIMIT]) {
1718c2ecf20Sopenharmony_ci		u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
1728c2ecf20Sopenharmony_ci
1738c2ecf20Sopenharmony_ci		q->params.limit = limit;
1748c2ecf20Sopenharmony_ci		sch->limit = limit;
1758c2ecf20Sopenharmony_ci	}
1768c2ecf20Sopenharmony_ci
1778c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_ALPHA])
1788c2ecf20Sopenharmony_ci		q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_BETA])
1818c2ecf20Sopenharmony_ci		q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_ECN])
1848c2ecf20Sopenharmony_ci		q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_BYTEMODE])
1878c2ecf20Sopenharmony_ci		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
1888c2ecf20Sopenharmony_ci
1898c2ecf20Sopenharmony_ci	if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
1908c2ecf20Sopenharmony_ci		q->params.dq_rate_estimator =
1918c2ecf20Sopenharmony_ci				nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci	/* Drop excess packets if new limit is lower */
1948c2ecf20Sopenharmony_ci	qlen = sch->q.qlen;
1958c2ecf20Sopenharmony_ci	while (sch->q.qlen > sch->limit) {
1968c2ecf20Sopenharmony_ci		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci		dropped += qdisc_pkt_len(skb);
1998c2ecf20Sopenharmony_ci		qdisc_qstats_backlog_dec(sch, skb);
2008c2ecf20Sopenharmony_ci		rtnl_qdisc_drop(skb, sch);
2018c2ecf20Sopenharmony_ci	}
2028c2ecf20Sopenharmony_ci	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_ci	sch_tree_unlock(sch);
2058c2ecf20Sopenharmony_ci	return 0;
2068c2ecf20Sopenharmony_ci}
2078c2ecf20Sopenharmony_ci
2088c2ecf20Sopenharmony_civoid pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
2098c2ecf20Sopenharmony_ci			 struct pie_vars *vars, u32 backlog)
2108c2ecf20Sopenharmony_ci{
2118c2ecf20Sopenharmony_ci	psched_time_t now = psched_get_time();
2128c2ecf20Sopenharmony_ci	u32 dtime = 0;
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	/* If dq_rate_estimator is disabled, calculate qdelay using the
2158c2ecf20Sopenharmony_ci	 * packet timestamp.
2168c2ecf20Sopenharmony_ci	 */
2178c2ecf20Sopenharmony_ci	if (!params->dq_rate_estimator) {
2188c2ecf20Sopenharmony_ci		vars->qdelay = now - pie_get_enqueue_time(skb);
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci		if (vars->dq_tstamp != DTIME_INVALID)
2218c2ecf20Sopenharmony_ci			dtime = now - vars->dq_tstamp;
2228c2ecf20Sopenharmony_ci
2238c2ecf20Sopenharmony_ci		vars->dq_tstamp = now;
2248c2ecf20Sopenharmony_ci
2258c2ecf20Sopenharmony_ci		if (backlog == 0)
2268c2ecf20Sopenharmony_ci			vars->qdelay = 0;
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_ci		if (dtime == 0)
2298c2ecf20Sopenharmony_ci			return;
2308c2ecf20Sopenharmony_ci
2318c2ecf20Sopenharmony_ci		goto burst_allowance_reduction;
2328c2ecf20Sopenharmony_ci	}
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	/* If current queue is about 10 packets or more and dq_count is unset
2358c2ecf20Sopenharmony_ci	 * we have enough packets to calculate the drain rate. Save
2368c2ecf20Sopenharmony_ci	 * current time as dq_tstamp and start measurement cycle.
2378c2ecf20Sopenharmony_ci	 */
2388c2ecf20Sopenharmony_ci	if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
2398c2ecf20Sopenharmony_ci		vars->dq_tstamp = psched_get_time();
2408c2ecf20Sopenharmony_ci		vars->dq_count = 0;
2418c2ecf20Sopenharmony_ci	}
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_ci	/* Calculate the average drain rate from this value. If queue length
2448c2ecf20Sopenharmony_ci	 * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
2458c2ecf20Sopenharmony_ci	 * the dq_count to -1 as we don't have enough packets to calculate the
2468c2ecf20Sopenharmony_ci	 * drain rate anymore. The following if block is entered only when we
2478c2ecf20Sopenharmony_ci	 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
2488c2ecf20Sopenharmony_ci	 * and we calculate the drain rate for the threshold here.  dq_count is
2498c2ecf20Sopenharmony_ci	 * in bytes, time difference in psched_time, hence rate is in
2508c2ecf20Sopenharmony_ci	 * bytes/psched_time.
2518c2ecf20Sopenharmony_ci	 */
2528c2ecf20Sopenharmony_ci	if (vars->dq_count != DQCOUNT_INVALID) {
2538c2ecf20Sopenharmony_ci		vars->dq_count += skb->len;
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci		if (vars->dq_count >= QUEUE_THRESHOLD) {
2568c2ecf20Sopenharmony_ci			u32 count = vars->dq_count << PIE_SCALE;
2578c2ecf20Sopenharmony_ci
2588c2ecf20Sopenharmony_ci			dtime = now - vars->dq_tstamp;
2598c2ecf20Sopenharmony_ci
2608c2ecf20Sopenharmony_ci			if (dtime == 0)
2618c2ecf20Sopenharmony_ci				return;
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci			count = count / dtime;
2648c2ecf20Sopenharmony_ci
2658c2ecf20Sopenharmony_ci			if (vars->avg_dq_rate == 0)
2668c2ecf20Sopenharmony_ci				vars->avg_dq_rate = count;
2678c2ecf20Sopenharmony_ci			else
2688c2ecf20Sopenharmony_ci				vars->avg_dq_rate =
2698c2ecf20Sopenharmony_ci				    (vars->avg_dq_rate -
2708c2ecf20Sopenharmony_ci				     (vars->avg_dq_rate >> 3)) + (count >> 3);
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci			/* If the queue has receded below the threshold, we hold
2738c2ecf20Sopenharmony_ci			 * on to the last drain rate calculated, else we reset
2748c2ecf20Sopenharmony_ci			 * dq_count to 0 to re-enter the if block when the next
2758c2ecf20Sopenharmony_ci			 * packet is dequeued
2768c2ecf20Sopenharmony_ci			 */
2778c2ecf20Sopenharmony_ci			if (backlog < QUEUE_THRESHOLD) {
2788c2ecf20Sopenharmony_ci				vars->dq_count = DQCOUNT_INVALID;
2798c2ecf20Sopenharmony_ci			} else {
2808c2ecf20Sopenharmony_ci				vars->dq_count = 0;
2818c2ecf20Sopenharmony_ci				vars->dq_tstamp = psched_get_time();
2828c2ecf20Sopenharmony_ci			}
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci			goto burst_allowance_reduction;
2858c2ecf20Sopenharmony_ci		}
2868c2ecf20Sopenharmony_ci	}
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ci	return;
2898c2ecf20Sopenharmony_ci
2908c2ecf20Sopenharmony_ciburst_allowance_reduction:
2918c2ecf20Sopenharmony_ci	if (vars->burst_time > 0) {
2928c2ecf20Sopenharmony_ci		if (vars->burst_time > dtime)
2938c2ecf20Sopenharmony_ci			vars->burst_time -= dtime;
2948c2ecf20Sopenharmony_ci		else
2958c2ecf20Sopenharmony_ci			vars->burst_time = 0;
2968c2ecf20Sopenharmony_ci	}
2978c2ecf20Sopenharmony_ci}
2988c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(pie_process_dequeue);
2998c2ecf20Sopenharmony_ci
3008c2ecf20Sopenharmony_civoid pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
3018c2ecf20Sopenharmony_ci			       u32 backlog)
3028c2ecf20Sopenharmony_ci{
3038c2ecf20Sopenharmony_ci	psched_time_t qdelay = 0;	/* in pschedtime */
3048c2ecf20Sopenharmony_ci	psched_time_t qdelay_old = 0;	/* in pschedtime */
3058c2ecf20Sopenharmony_ci	s64 delta = 0;		/* determines the change in probability */
3068c2ecf20Sopenharmony_ci	u64 oldprob;
3078c2ecf20Sopenharmony_ci	u64 alpha, beta;
3088c2ecf20Sopenharmony_ci	u32 power;
3098c2ecf20Sopenharmony_ci	bool update_prob = true;
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	if (params->dq_rate_estimator) {
3128c2ecf20Sopenharmony_ci		qdelay_old = vars->qdelay;
3138c2ecf20Sopenharmony_ci		vars->qdelay_old = vars->qdelay;
3148c2ecf20Sopenharmony_ci
3158c2ecf20Sopenharmony_ci		if (vars->avg_dq_rate > 0)
3168c2ecf20Sopenharmony_ci			qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate;
3178c2ecf20Sopenharmony_ci		else
3188c2ecf20Sopenharmony_ci			qdelay = 0;
3198c2ecf20Sopenharmony_ci	} else {
3208c2ecf20Sopenharmony_ci		qdelay = vars->qdelay;
3218c2ecf20Sopenharmony_ci		qdelay_old = vars->qdelay_old;
3228c2ecf20Sopenharmony_ci	}
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_ci	/* If qdelay is zero and backlog is not, it means backlog is very small,
3258c2ecf20Sopenharmony_ci	 * so we do not update probabilty in this round.
3268c2ecf20Sopenharmony_ci	 */
3278c2ecf20Sopenharmony_ci	if (qdelay == 0 && backlog != 0)
3288c2ecf20Sopenharmony_ci		update_prob = false;
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci	/* In the algorithm, alpha and beta are between 0 and 2 with typical
3318c2ecf20Sopenharmony_ci	 * value for alpha as 0.125. In this implementation, we use values 0-32
3328c2ecf20Sopenharmony_ci	 * passed from user space to represent this. Also, alpha and beta have
3338c2ecf20Sopenharmony_ci	 * unit of HZ and need to be scaled before they can used to update
3348c2ecf20Sopenharmony_ci	 * probability. alpha/beta are updated locally below by scaling down
3358c2ecf20Sopenharmony_ci	 * by 16 to come to 0-2 range.
3368c2ecf20Sopenharmony_ci	 */
3378c2ecf20Sopenharmony_ci	alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
3388c2ecf20Sopenharmony_ci	beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
3398c2ecf20Sopenharmony_ci
3408c2ecf20Sopenharmony_ci	/* We scale alpha and beta differently depending on how heavy the
3418c2ecf20Sopenharmony_ci	 * congestion is. Please see RFC 8033 for details.
3428c2ecf20Sopenharmony_ci	 */
3438c2ecf20Sopenharmony_ci	if (vars->prob < MAX_PROB / 10) {
3448c2ecf20Sopenharmony_ci		alpha >>= 1;
3458c2ecf20Sopenharmony_ci		beta >>= 1;
3468c2ecf20Sopenharmony_ci
3478c2ecf20Sopenharmony_ci		power = 100;
3488c2ecf20Sopenharmony_ci		while (vars->prob < div_u64(MAX_PROB, power) &&
3498c2ecf20Sopenharmony_ci		       power <= 1000000) {
3508c2ecf20Sopenharmony_ci			alpha >>= 2;
3518c2ecf20Sopenharmony_ci			beta >>= 2;
3528c2ecf20Sopenharmony_ci			power *= 10;
3538c2ecf20Sopenharmony_ci		}
3548c2ecf20Sopenharmony_ci	}
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
3578c2ecf20Sopenharmony_ci	delta += alpha * (qdelay - params->target);
3588c2ecf20Sopenharmony_ci	delta += beta * (qdelay - qdelay_old);
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	oldprob = vars->prob;
3618c2ecf20Sopenharmony_ci
3628c2ecf20Sopenharmony_ci	/* to ensure we increase probability in steps of no more than 2% */
3638c2ecf20Sopenharmony_ci	if (delta > (s64)(MAX_PROB / (100 / 2)) &&
3648c2ecf20Sopenharmony_ci	    vars->prob >= MAX_PROB / 10)
3658c2ecf20Sopenharmony_ci		delta = (MAX_PROB / 100) * 2;
3668c2ecf20Sopenharmony_ci
3678c2ecf20Sopenharmony_ci	/* Non-linear drop:
3688c2ecf20Sopenharmony_ci	 * Tune drop probability to increase quickly for high delays(>= 250ms)
3698c2ecf20Sopenharmony_ci	 * 250ms is derived through experiments and provides error protection
3708c2ecf20Sopenharmony_ci	 */
3718c2ecf20Sopenharmony_ci
3728c2ecf20Sopenharmony_ci	if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
3738c2ecf20Sopenharmony_ci		delta += MAX_PROB / (100 / 2);
3748c2ecf20Sopenharmony_ci
3758c2ecf20Sopenharmony_ci	vars->prob += delta;
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	if (delta > 0) {
3788c2ecf20Sopenharmony_ci		/* prevent overflow */
3798c2ecf20Sopenharmony_ci		if (vars->prob < oldprob) {
3808c2ecf20Sopenharmony_ci			vars->prob = MAX_PROB;
3818c2ecf20Sopenharmony_ci			/* Prevent normalization error. If probability is at
3828c2ecf20Sopenharmony_ci			 * maximum value already, we normalize it here, and
3838c2ecf20Sopenharmony_ci			 * skip the check to do a non-linear drop in the next
3848c2ecf20Sopenharmony_ci			 * section.
3858c2ecf20Sopenharmony_ci			 */
3868c2ecf20Sopenharmony_ci			update_prob = false;
3878c2ecf20Sopenharmony_ci		}
3888c2ecf20Sopenharmony_ci	} else {
3898c2ecf20Sopenharmony_ci		/* prevent underflow */
3908c2ecf20Sopenharmony_ci		if (vars->prob > oldprob)
3918c2ecf20Sopenharmony_ci			vars->prob = 0;
3928c2ecf20Sopenharmony_ci	}
3938c2ecf20Sopenharmony_ci
3948c2ecf20Sopenharmony_ci	/* Non-linear drop in probability: Reduce drop probability quickly if
3958c2ecf20Sopenharmony_ci	 * delay is 0 for 2 consecutive Tupdate periods.
3968c2ecf20Sopenharmony_ci	 */
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci	if (qdelay == 0 && qdelay_old == 0 && update_prob)
3998c2ecf20Sopenharmony_ci		/* Reduce drop probability to 98.4% */
4008c2ecf20Sopenharmony_ci		vars->prob -= vars->prob / 64;
4018c2ecf20Sopenharmony_ci
4028c2ecf20Sopenharmony_ci	vars->qdelay = qdelay;
4038c2ecf20Sopenharmony_ci	vars->backlog_old = backlog;
4048c2ecf20Sopenharmony_ci
4058c2ecf20Sopenharmony_ci	/* We restart the measurement cycle if the following conditions are met
4068c2ecf20Sopenharmony_ci	 * 1. If the delay has been low for 2 consecutive Tupdate periods
4078c2ecf20Sopenharmony_ci	 * 2. Calculated drop probability is zero
4088c2ecf20Sopenharmony_ci	 * 3. If average dq_rate_estimator is enabled, we have atleast one
4098c2ecf20Sopenharmony_ci	 *    estimate for the avg_dq_rate ie., is a non-zero value
4108c2ecf20Sopenharmony_ci	 */
4118c2ecf20Sopenharmony_ci	if ((vars->qdelay < params->target / 2) &&
4128c2ecf20Sopenharmony_ci	    (vars->qdelay_old < params->target / 2) &&
4138c2ecf20Sopenharmony_ci	    vars->prob == 0 &&
4148c2ecf20Sopenharmony_ci	    (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
4158c2ecf20Sopenharmony_ci		pie_vars_init(vars);
4168c2ecf20Sopenharmony_ci	}
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ci	if (!params->dq_rate_estimator)
4198c2ecf20Sopenharmony_ci		vars->qdelay_old = qdelay;
4208c2ecf20Sopenharmony_ci}
4218c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(pie_calculate_probability);
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_cistatic void pie_timer(struct timer_list *t)
4248c2ecf20Sopenharmony_ci{
4258c2ecf20Sopenharmony_ci	struct pie_sched_data *q = from_timer(q, t, adapt_timer);
4268c2ecf20Sopenharmony_ci	struct Qdisc *sch = q->sch;
4278c2ecf20Sopenharmony_ci	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
4288c2ecf20Sopenharmony_ci
4298c2ecf20Sopenharmony_ci	spin_lock(root_lock);
4308c2ecf20Sopenharmony_ci	pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
4318c2ecf20Sopenharmony_ci
4328c2ecf20Sopenharmony_ci	/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
4338c2ecf20Sopenharmony_ci	if (q->params.tupdate)
4348c2ecf20Sopenharmony_ci		mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
4358c2ecf20Sopenharmony_ci	spin_unlock(root_lock);
4368c2ecf20Sopenharmony_ci}
4378c2ecf20Sopenharmony_ci
4388c2ecf20Sopenharmony_cistatic int pie_init(struct Qdisc *sch, struct nlattr *opt,
4398c2ecf20Sopenharmony_ci		    struct netlink_ext_ack *extack)
4408c2ecf20Sopenharmony_ci{
4418c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
4428c2ecf20Sopenharmony_ci
4438c2ecf20Sopenharmony_ci	pie_params_init(&q->params);
4448c2ecf20Sopenharmony_ci	pie_vars_init(&q->vars);
4458c2ecf20Sopenharmony_ci	sch->limit = q->params.limit;
4468c2ecf20Sopenharmony_ci
4478c2ecf20Sopenharmony_ci	q->sch = sch;
4488c2ecf20Sopenharmony_ci	timer_setup(&q->adapt_timer, pie_timer, 0);
4498c2ecf20Sopenharmony_ci
4508c2ecf20Sopenharmony_ci	if (opt) {
4518c2ecf20Sopenharmony_ci		int err = pie_change(sch, opt, extack);
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci		if (err)
4548c2ecf20Sopenharmony_ci			return err;
4558c2ecf20Sopenharmony_ci	}
4568c2ecf20Sopenharmony_ci
4578c2ecf20Sopenharmony_ci	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
4588c2ecf20Sopenharmony_ci	return 0;
4598c2ecf20Sopenharmony_ci}
4608c2ecf20Sopenharmony_ci
4618c2ecf20Sopenharmony_cistatic int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
4628c2ecf20Sopenharmony_ci{
4638c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
4648c2ecf20Sopenharmony_ci	struct nlattr *opts;
4658c2ecf20Sopenharmony_ci
4668c2ecf20Sopenharmony_ci	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
4678c2ecf20Sopenharmony_ci	if (!opts)
4688c2ecf20Sopenharmony_ci		goto nla_put_failure;
4698c2ecf20Sopenharmony_ci
4708c2ecf20Sopenharmony_ci	/* convert target from pschedtime to us */
4718c2ecf20Sopenharmony_ci	if (nla_put_u32(skb, TCA_PIE_TARGET,
4728c2ecf20Sopenharmony_ci			((u32)PSCHED_TICKS2NS(q->params.target)) /
4738c2ecf20Sopenharmony_ci			NSEC_PER_USEC) ||
4748c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
4758c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_TUPDATE,
4768c2ecf20Sopenharmony_ci			jiffies_to_usecs(q->params.tupdate)) ||
4778c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
4788c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
4798c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
4808c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
4818c2ecf20Sopenharmony_ci	    nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
4828c2ecf20Sopenharmony_ci			q->params.dq_rate_estimator))
4838c2ecf20Sopenharmony_ci		goto nla_put_failure;
4848c2ecf20Sopenharmony_ci
4858c2ecf20Sopenharmony_ci	return nla_nest_end(skb, opts);
4868c2ecf20Sopenharmony_ci
4878c2ecf20Sopenharmony_cinla_put_failure:
4888c2ecf20Sopenharmony_ci	nla_nest_cancel(skb, opts);
4898c2ecf20Sopenharmony_ci	return -1;
4908c2ecf20Sopenharmony_ci}
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_cistatic int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
4938c2ecf20Sopenharmony_ci{
4948c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
4958c2ecf20Sopenharmony_ci	struct tc_pie_xstats st = {
4968c2ecf20Sopenharmony_ci		.prob		= q->vars.prob << BITS_PER_BYTE,
4978c2ecf20Sopenharmony_ci		.delay		= ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
4988c2ecf20Sopenharmony_ci				   NSEC_PER_USEC,
4998c2ecf20Sopenharmony_ci		.packets_in	= q->stats.packets_in,
5008c2ecf20Sopenharmony_ci		.overlimit	= q->stats.overlimit,
5018c2ecf20Sopenharmony_ci		.maxq		= q->stats.maxq,
5028c2ecf20Sopenharmony_ci		.dropped	= q->stats.dropped,
5038c2ecf20Sopenharmony_ci		.ecn_mark	= q->stats.ecn_mark,
5048c2ecf20Sopenharmony_ci	};
5058c2ecf20Sopenharmony_ci
5068c2ecf20Sopenharmony_ci	/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
5078c2ecf20Sopenharmony_ci	st.dq_rate_estimating = q->params.dq_rate_estimator;
5088c2ecf20Sopenharmony_ci
5098c2ecf20Sopenharmony_ci	/* unscale and return dq_rate in bytes per sec */
5108c2ecf20Sopenharmony_ci	if (q->params.dq_rate_estimator)
5118c2ecf20Sopenharmony_ci		st.avg_dq_rate = q->vars.avg_dq_rate *
5128c2ecf20Sopenharmony_ci				 (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
5138c2ecf20Sopenharmony_ci
5148c2ecf20Sopenharmony_ci	return gnet_stats_copy_app(d, &st, sizeof(st));
5158c2ecf20Sopenharmony_ci}
5168c2ecf20Sopenharmony_ci
5178c2ecf20Sopenharmony_cistatic struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
5188c2ecf20Sopenharmony_ci{
5198c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
5208c2ecf20Sopenharmony_ci	struct sk_buff *skb = qdisc_dequeue_head(sch);
5218c2ecf20Sopenharmony_ci
5228c2ecf20Sopenharmony_ci	if (!skb)
5238c2ecf20Sopenharmony_ci		return NULL;
5248c2ecf20Sopenharmony_ci
5258c2ecf20Sopenharmony_ci	pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
5268c2ecf20Sopenharmony_ci	return skb;
5278c2ecf20Sopenharmony_ci}
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_cistatic void pie_reset(struct Qdisc *sch)
5308c2ecf20Sopenharmony_ci{
5318c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
5328c2ecf20Sopenharmony_ci
5338c2ecf20Sopenharmony_ci	qdisc_reset_queue(sch);
5348c2ecf20Sopenharmony_ci	pie_vars_init(&q->vars);
5358c2ecf20Sopenharmony_ci}
5368c2ecf20Sopenharmony_ci
5378c2ecf20Sopenharmony_cistatic void pie_destroy(struct Qdisc *sch)
5388c2ecf20Sopenharmony_ci{
5398c2ecf20Sopenharmony_ci	struct pie_sched_data *q = qdisc_priv(sch);
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci	q->params.tupdate = 0;
5428c2ecf20Sopenharmony_ci	del_timer_sync(&q->adapt_timer);
5438c2ecf20Sopenharmony_ci}
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_cistatic struct Qdisc_ops pie_qdisc_ops __read_mostly = {
5468c2ecf20Sopenharmony_ci	.id		= "pie",
5478c2ecf20Sopenharmony_ci	.priv_size	= sizeof(struct pie_sched_data),
5488c2ecf20Sopenharmony_ci	.enqueue	= pie_qdisc_enqueue,
5498c2ecf20Sopenharmony_ci	.dequeue	= pie_qdisc_dequeue,
5508c2ecf20Sopenharmony_ci	.peek		= qdisc_peek_dequeued,
5518c2ecf20Sopenharmony_ci	.init		= pie_init,
5528c2ecf20Sopenharmony_ci	.destroy	= pie_destroy,
5538c2ecf20Sopenharmony_ci	.reset		= pie_reset,
5548c2ecf20Sopenharmony_ci	.change		= pie_change,
5558c2ecf20Sopenharmony_ci	.dump		= pie_dump,
5568c2ecf20Sopenharmony_ci	.dump_stats	= pie_dump_stats,
5578c2ecf20Sopenharmony_ci	.owner		= THIS_MODULE,
5588c2ecf20Sopenharmony_ci};
5598c2ecf20Sopenharmony_ci
5608c2ecf20Sopenharmony_cistatic int __init pie_module_init(void)
5618c2ecf20Sopenharmony_ci{
5628c2ecf20Sopenharmony_ci	return register_qdisc(&pie_qdisc_ops);
5638c2ecf20Sopenharmony_ci}
5648c2ecf20Sopenharmony_ci
5658c2ecf20Sopenharmony_cistatic void __exit pie_module_exit(void)
5668c2ecf20Sopenharmony_ci{
5678c2ecf20Sopenharmony_ci	unregister_qdisc(&pie_qdisc_ops);
5688c2ecf20Sopenharmony_ci}
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_cimodule_init(pie_module_init);
5718c2ecf20Sopenharmony_cimodule_exit(pie_module_exit);
5728c2ecf20Sopenharmony_ci
5738c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
5748c2ecf20Sopenharmony_ciMODULE_AUTHOR("Vijay Subramanian");
5758c2ecf20Sopenharmony_ciMODULE_AUTHOR("Mythili Prabhu");
5768c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL");
577