162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/* DataCenter TCP (DCTCP) congestion control.
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * http://simula.stanford.edu/~alizade/Site/DCTCP.html
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * This is an implementation of DCTCP over Reno, an enhancement to the
762306a36Sopenharmony_ci * TCP congestion control algorithm designed for data centers. DCTCP
862306a36Sopenharmony_ci * leverages Explicit Congestion Notification (ECN) in the network to
962306a36Sopenharmony_ci * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
1062306a36Sopenharmony_ci * the following three data center transport requirements:
1162306a36Sopenharmony_ci *
1262306a36Sopenharmony_ci *  - High burst tolerance (incast due to partition/aggregate)
1362306a36Sopenharmony_ci *  - Low latency (short flows, queries)
1462306a36Sopenharmony_ci *  - High throughput (continuous data updates, large file transfers)
1562306a36Sopenharmony_ci *    with commodity shallow buffered switches
1662306a36Sopenharmony_ci *
1762306a36Sopenharmony_ci * The algorithm is described in detail in the following two papers:
1862306a36Sopenharmony_ci *
1962306a36Sopenharmony_ci * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
2062306a36Sopenharmony_ci *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
2162306a36Sopenharmony_ci *      "Data Center TCP (DCTCP)", Data Center Networks session
2262306a36Sopenharmony_ci *      Proc. ACM SIGCOMM, New Delhi, 2010.
2362306a36Sopenharmony_ci *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
2462306a36Sopenharmony_ci *
2562306a36Sopenharmony_ci * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
2662306a36Sopenharmony_ci *      "Analysis of DCTCP: Stability, Convergence, and Fairness"
2762306a36Sopenharmony_ci *      Proc. ACM SIGMETRICS, San Jose, 2011.
2862306a36Sopenharmony_ci *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
2962306a36Sopenharmony_ci *
3062306a36Sopenharmony_ci * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci * Authors:
3362306a36Sopenharmony_ci *
3462306a36Sopenharmony_ci *	Daniel Borkmann <dborkman@redhat.com>
3562306a36Sopenharmony_ci *	Florian Westphal <fw@strlen.de>
3662306a36Sopenharmony_ci *	Glenn Judd <glenn.judd@morganstanley.com>
3762306a36Sopenharmony_ci */
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci#include <linux/btf.h>
4062306a36Sopenharmony_ci#include <linux/btf_ids.h>
4162306a36Sopenharmony_ci#include <linux/module.h>
4262306a36Sopenharmony_ci#include <linux/mm.h>
4362306a36Sopenharmony_ci#include <net/tcp.h>
4462306a36Sopenharmony_ci#include <linux/inet_diag.h>
4562306a36Sopenharmony_ci#include "tcp_dctcp.h"
4662306a36Sopenharmony_ci
/* Upper bound for dctcp_alpha. Alpha is kept in fixed point with a scale of
 * 2^10, so this value corresponds to a marked fraction of 1.0; every
 * assignment to dctcp_alpha in this file is capped at it.
 */
#define DCTCP_MAX_ALPHA	1024U
4862306a36Sopenharmony_ci
/* Per-socket DCTCP private state, stored in the congestion-control private
 * area returned by inet_csk_ca(). Its size is checked against
 * ICSK_CA_PRIV_SIZE at build time in dctcp_register().
 */
struct dctcp {
	u32 old_delivered;	/* tp->delivered at the last dctcp_reset() */
	u32 old_delivered_ce;	/* tp->delivered_ce at the last dctcp_reset() */
	u32 prior_rcv_nxt;	/* tp->rcv_nxt at init; updated via dctcp_ece_ack_update() */
	u32 dctcp_alpha;	/* EWMA of CE-marked fraction, 0..DCTCP_MAX_ALPHA */
	u32 next_seq;		/* tp->snd_nxt at the last dctcp_reset(); end of current window */
	u32 ce_state;		/* CE state handed to dctcp_ece_ack_update() */
	u32 loss_cwnd;		/* cwnd saved before a reduction, used by dctcp_cwnd_undo() */
	struct tcp_plb_state plb; /* state handed to the tcp_plb_*() helpers */
};
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_cistatic unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
6162306a36Sopenharmony_cimodule_param(dctcp_shift_g, uint, 0644);
6262306a36Sopenharmony_ciMODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
6362306a36Sopenharmony_ci
/* Initial value of dctcp_alpha for new connections. dctcp_init() caps it at
 * DCTCP_MAX_ALPHA, so larger values written to the parameter have no extra
 * effect.
 */
static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
module_param(dctcp_alpha_on_init, uint, 0644);
MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
6762306a36Sopenharmony_ci
/* Forward declaration: dctcp_init() and dctcp_get_info() reference the Reno
 * fallback ops before their definition further down in this file.
 */
static struct tcp_congestion_ops dctcp_reno;
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_cistatic void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
7162306a36Sopenharmony_ci{
7262306a36Sopenharmony_ci	ca->next_seq = tp->snd_nxt;
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci	ca->old_delivered = tp->delivered;
7562306a36Sopenharmony_ci	ca->old_delivered_ce = tp->delivered_ce;
7662306a36Sopenharmony_ci}
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_ci__bpf_kfunc static void dctcp_init(struct sock *sk)
7962306a36Sopenharmony_ci{
8062306a36Sopenharmony_ci	const struct tcp_sock *tp = tcp_sk(sk);
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci	if ((tp->ecn_flags & TCP_ECN_OK) ||
8362306a36Sopenharmony_ci	    (sk->sk_state == TCP_LISTEN ||
8462306a36Sopenharmony_ci	     sk->sk_state == TCP_CLOSE)) {
8562306a36Sopenharmony_ci		struct dctcp *ca = inet_csk_ca(sk);
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci		ca->prior_rcv_nxt = tp->rcv_nxt;
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci		ca->loss_cwnd = 0;
9262306a36Sopenharmony_ci		ca->ce_state = 0;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci		dctcp_reset(tp, ca);
9562306a36Sopenharmony_ci		tcp_plb_init(sk, &ca->plb);
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci		return;
9862306a36Sopenharmony_ci	}
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	/* No ECN support? Fall back to Reno. Also need to clear
10162306a36Sopenharmony_ci	 * ECT from sk since it is set during 3WHS for DCTCP.
10262306a36Sopenharmony_ci	 */
10362306a36Sopenharmony_ci	inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
10462306a36Sopenharmony_ci	INET_ECN_dontxmit(sk);
10562306a36Sopenharmony_ci}
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
10862306a36Sopenharmony_ci{
10962306a36Sopenharmony_ci	struct dctcp *ca = inet_csk_ca(sk);
11062306a36Sopenharmony_ci	struct tcp_sock *tp = tcp_sk(sk);
11162306a36Sopenharmony_ci
11262306a36Sopenharmony_ci	ca->loss_cwnd = tcp_snd_cwnd(tp);
11362306a36Sopenharmony_ci	return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
11462306a36Sopenharmony_ci}
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
11762306a36Sopenharmony_ci{
11862306a36Sopenharmony_ci	const struct tcp_sock *tp = tcp_sk(sk);
11962306a36Sopenharmony_ci	struct dctcp *ca = inet_csk_ca(sk);
12062306a36Sopenharmony_ci
12162306a36Sopenharmony_ci	/* Expired RTT */
12262306a36Sopenharmony_ci	if (!before(tp->snd_una, ca->next_seq)) {
12362306a36Sopenharmony_ci		u32 delivered = tp->delivered - ca->old_delivered;
12462306a36Sopenharmony_ci		u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
12562306a36Sopenharmony_ci		u32 alpha = ca->dctcp_alpha;
12662306a36Sopenharmony_ci		u32 ce_ratio = 0;
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci		if (delivered > 0) {
12962306a36Sopenharmony_ci			/* dctcp_alpha keeps EWMA of fraction of ECN marked
13062306a36Sopenharmony_ci			 * packets. Because of EWMA smoothing, PLB reaction can
13162306a36Sopenharmony_ci			 * be slow so we use ce_ratio which is an instantaneous
13262306a36Sopenharmony_ci			 * measure of congestion. ce_ratio is the fraction of
13362306a36Sopenharmony_ci			 * ECN marked packets in the previous RTT.
13462306a36Sopenharmony_ci			 */
13562306a36Sopenharmony_ci			if (delivered_ce > 0)
13662306a36Sopenharmony_ci				ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
13762306a36Sopenharmony_ci			tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
13862306a36Sopenharmony_ci			tcp_plb_check_rehash(sk, &ca->plb);
13962306a36Sopenharmony_ci		}
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_ci		/* alpha = (1 - g) * alpha + g * F */
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci		alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
14462306a36Sopenharmony_ci		if (delivered_ce) {
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci			/* If dctcp_shift_g == 1, a 32bit value would overflow
14762306a36Sopenharmony_ci			 * after 8 M packets.
14862306a36Sopenharmony_ci			 */
14962306a36Sopenharmony_ci			delivered_ce <<= (10 - dctcp_shift_g);
15062306a36Sopenharmony_ci			delivered_ce /= max(1U, delivered);
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci			alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
15362306a36Sopenharmony_ci		}
15462306a36Sopenharmony_ci		/* dctcp_alpha can be read from dctcp_get_info() without
15562306a36Sopenharmony_ci		 * synchro, so we ask compiler to not use dctcp_alpha
15662306a36Sopenharmony_ci		 * as a temporary variable in prior operations.
15762306a36Sopenharmony_ci		 */
15862306a36Sopenharmony_ci		WRITE_ONCE(ca->dctcp_alpha, alpha);
15962306a36Sopenharmony_ci		dctcp_reset(tp, ca);
16062306a36Sopenharmony_ci	}
16162306a36Sopenharmony_ci}
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_cistatic void dctcp_react_to_loss(struct sock *sk)
16462306a36Sopenharmony_ci{
16562306a36Sopenharmony_ci	struct dctcp *ca = inet_csk_ca(sk);
16662306a36Sopenharmony_ci	struct tcp_sock *tp = tcp_sk(sk);
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci	ca->loss_cwnd = tcp_snd_cwnd(tp);
16962306a36Sopenharmony_ci	tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
17062306a36Sopenharmony_ci}
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
17362306a36Sopenharmony_ci{
17462306a36Sopenharmony_ci	if (new_state == TCP_CA_Recovery &&
17562306a36Sopenharmony_ci	    new_state != inet_csk(sk)->icsk_ca_state)
17662306a36Sopenharmony_ci		dctcp_react_to_loss(sk);
17762306a36Sopenharmony_ci	/* We handle RTO in dctcp_cwnd_event to ensure that we perform only
17862306a36Sopenharmony_ci	 * one loss-adjustment per RTT.
17962306a36Sopenharmony_ci	 */
18062306a36Sopenharmony_ci}
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
18362306a36Sopenharmony_ci{
18462306a36Sopenharmony_ci	struct dctcp *ca = inet_csk_ca(sk);
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci	switch (ev) {
18762306a36Sopenharmony_ci	case CA_EVENT_ECN_IS_CE:
18862306a36Sopenharmony_ci	case CA_EVENT_ECN_NO_CE:
18962306a36Sopenharmony_ci		dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
19062306a36Sopenharmony_ci		break;
19162306a36Sopenharmony_ci	case CA_EVENT_LOSS:
19262306a36Sopenharmony_ci		tcp_plb_update_state_upon_rto(sk, &ca->plb);
19362306a36Sopenharmony_ci		dctcp_react_to_loss(sk);
19462306a36Sopenharmony_ci		break;
19562306a36Sopenharmony_ci	case CA_EVENT_TX_START:
19662306a36Sopenharmony_ci		tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */
19762306a36Sopenharmony_ci		break;
19862306a36Sopenharmony_ci	default:
19962306a36Sopenharmony_ci		/* Don't care for the rest. */
20062306a36Sopenharmony_ci		break;
20162306a36Sopenharmony_ci	}
20262306a36Sopenharmony_ci}
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_cistatic size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
20562306a36Sopenharmony_ci			     union tcp_cc_info *info)
20662306a36Sopenharmony_ci{
20762306a36Sopenharmony_ci	const struct dctcp *ca = inet_csk_ca(sk);
20862306a36Sopenharmony_ci	const struct tcp_sock *tp = tcp_sk(sk);
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	/* Fill it also in case of VEGASINFO due to req struct limits.
21162306a36Sopenharmony_ci	 * We can still correctly retrieve it later.
21262306a36Sopenharmony_ci	 */
21362306a36Sopenharmony_ci	if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
21462306a36Sopenharmony_ci	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
21562306a36Sopenharmony_ci		memset(&info->dctcp, 0, sizeof(info->dctcp));
21662306a36Sopenharmony_ci		if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
21762306a36Sopenharmony_ci			info->dctcp.dctcp_enabled = 1;
21862306a36Sopenharmony_ci			info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
21962306a36Sopenharmony_ci			info->dctcp.dctcp_alpha = ca->dctcp_alpha;
22062306a36Sopenharmony_ci			info->dctcp.dctcp_ab_ecn = tp->mss_cache *
22162306a36Sopenharmony_ci						   (tp->delivered_ce - ca->old_delivered_ce);
22262306a36Sopenharmony_ci			info->dctcp.dctcp_ab_tot = tp->mss_cache *
22362306a36Sopenharmony_ci						   (tp->delivered - ca->old_delivered);
22462306a36Sopenharmony_ci		}
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci		*attr = INET_DIAG_DCTCPINFO;
22762306a36Sopenharmony_ci		return sizeof(info->dctcp);
22862306a36Sopenharmony_ci	}
22962306a36Sopenharmony_ci	return 0;
23062306a36Sopenharmony_ci}
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ci__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
23362306a36Sopenharmony_ci{
23462306a36Sopenharmony_ci	const struct dctcp *ca = inet_csk_ca(sk);
23562306a36Sopenharmony_ci	struct tcp_sock *tp = tcp_sk(sk);
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_ci	return max(tcp_snd_cwnd(tp), ca->loss_cwnd);
23862306a36Sopenharmony_ci}
23962306a36Sopenharmony_ci
/* Congestion-control ops for full DCTCP, registered under the name "dctcp".
 * Window growth uses the stock Reno cong_avoid; TCP_CONG_NEEDS_ECN is set in
 * .flags.
 */
static struct tcp_congestion_ops dctcp __read_mostly = {
	.init		= dctcp_init,
	.in_ack_event   = dctcp_update_alpha,
	.cwnd_event	= dctcp_cwnd_event,
	.ssthresh	= dctcp_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= dctcp_cwnd_undo,
	.set_state	= dctcp_state,
	.get_info	= dctcp_get_info,
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "dctcp",
};
25362306a36Sopenharmony_ci
/* Fallback ops installed by dctcp_init() when ECN was not negotiated: plain
 * Reno behaviour, but still answering get_info through dctcp_get_info(). This
 * table is not passed to tcp_register_congestion_control() in this file.
 */
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.get_info	= dctcp_get_info,
	.owner		= THIS_MODULE,
	.name		= "dctcp-reno",
};
26262306a36Sopenharmony_ci
/* BTF id set naming the DCTCP hooks marked __bpf_kfunc above. The entries are
 * only compiled in when both CONFIG_X86 and CONFIG_DYNAMIC_FTRACE are
 * enabled; otherwise the set is empty.
 */
BTF_SET8_START(tcp_dctcp_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, dctcp_init)
BTF_ID_FLAGS(func, dctcp_update_alpha)
BTF_ID_FLAGS(func, dctcp_cwnd_event)
BTF_ID_FLAGS(func, dctcp_ssthresh)
BTF_ID_FLAGS(func, dctcp_cwnd_undo)
BTF_ID_FLAGS(func, dctcp_state)
#endif
#endif
BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
27562306a36Sopenharmony_ci
/* kfunc id set descriptor handed to register_btf_kfunc_id_set() in
 * dctcp_register(); it ties the id set above to this module.
 */
static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &tcp_dctcp_check_kfunc_ids,
};
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_cistatic int __init dctcp_register(void)
28262306a36Sopenharmony_ci{
28362306a36Sopenharmony_ci	int ret;
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_ci	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
28862306a36Sopenharmony_ci	if (ret < 0)
28962306a36Sopenharmony_ci		return ret;
29062306a36Sopenharmony_ci	return tcp_register_congestion_control(&dctcp);
29162306a36Sopenharmony_ci}
29262306a36Sopenharmony_ci
/* Module exit: unregister the "dctcp" congestion control. */
static void __exit dctcp_unregister(void)
{
	tcp_unregister_congestion_control(&dctcp);
}
29762306a36Sopenharmony_ci
/* Module entry/exit hooks and module metadata. */
module_init(dctcp_register);
module_exit(dctcp_unregister);

MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
307