162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* DataCenter TCP (DCTCP) congestion control. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * http://simula.stanford.edu/~alizade/Site/DCTCP.html 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * This is an implementation of DCTCP over Reno, an enhancement to the 762306a36Sopenharmony_ci * TCP congestion control algorithm designed for data centers. DCTCP 862306a36Sopenharmony_ci * leverages Explicit Congestion Notification (ECN) in the network to 962306a36Sopenharmony_ci * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet 1062306a36Sopenharmony_ci * the following three data center transport requirements: 1162306a36Sopenharmony_ci * 1262306a36Sopenharmony_ci * - High burst tolerance (incast due to partition/aggregate) 1362306a36Sopenharmony_ci * - Low latency (short flows, queries) 1462306a36Sopenharmony_ci * - High throughput (continuous data updates, large file transfers) 1562306a36Sopenharmony_ci * with commodity shallow buffered switches 1662306a36Sopenharmony_ci * 1762306a36Sopenharmony_ci * The algorithm is described in detail in the following two papers: 1862306a36Sopenharmony_ci * 1962306a36Sopenharmony_ci * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, 2062306a36Sopenharmony_ci * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: 2162306a36Sopenharmony_ci * "Data Center TCP (DCTCP)", Data Center Networks session 2262306a36Sopenharmony_ci * Proc. ACM SIGCOMM, New Delhi, 2010. 2362306a36Sopenharmony_ci * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf 2462306a36Sopenharmony_ci * 2562306a36Sopenharmony_ci * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: 2662306a36Sopenharmony_ci * "Analysis of DCTCP: Stability, Convergence, and Fairness" 2762306a36Sopenharmony_ci * Proc. ACM SIGMETRICS, San Jose, 2011. 2862306a36Sopenharmony_ci * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf 2962306a36Sopenharmony_ci * 3062306a36Sopenharmony_ci * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci * Authors: 3362306a36Sopenharmony_ci * 3462306a36Sopenharmony_ci * Daniel Borkmann <dborkman@redhat.com> 3562306a36Sopenharmony_ci * Florian Westphal <fw@strlen.de> 3662306a36Sopenharmony_ci * Glenn Judd <glenn.judd@morganstanley.com> 3762306a36Sopenharmony_ci */ 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#include <linux/btf.h> 4062306a36Sopenharmony_ci#include <linux/btf_ids.h> 4162306a36Sopenharmony_ci#include <linux/module.h> 4262306a36Sopenharmony_ci#include <linux/mm.h> 4362306a36Sopenharmony_ci#include <net/tcp.h> 4462306a36Sopenharmony_ci#include <linux/inet_diag.h> 4562306a36Sopenharmony_ci#include "tcp_dctcp.h" 4662306a36Sopenharmony_ci 4762306a36Sopenharmony_ci#define DCTCP_MAX_ALPHA 1024U 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistruct dctcp { 5062306a36Sopenharmony_ci u32 old_delivered; 5162306a36Sopenharmony_ci u32 old_delivered_ce; 5262306a36Sopenharmony_ci u32 prior_rcv_nxt; 5362306a36Sopenharmony_ci u32 dctcp_alpha; 5462306a36Sopenharmony_ci u32 next_seq; 5562306a36Sopenharmony_ci u32 ce_state; 5662306a36Sopenharmony_ci u32 loss_cwnd; 5762306a36Sopenharmony_ci struct tcp_plb_state plb; 5862306a36Sopenharmony_ci}; 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cistatic unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ 6162306a36Sopenharmony_cimodule_param(dctcp_shift_g, uint, 0644); 6262306a36Sopenharmony_ciMODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_cistatic unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; 6562306a36Sopenharmony_cimodule_param(dctcp_alpha_on_init, uint, 0644); 6662306a36Sopenharmony_ciMODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_cistatic struct tcp_congestion_ops dctcp_reno; 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_cistatic void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) 7162306a36Sopenharmony_ci{ 7262306a36Sopenharmony_ci ca->next_seq = tp->snd_nxt; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci ca->old_delivered = tp->delivered; 7562306a36Sopenharmony_ci ca->old_delivered_ce = tp->delivered_ce; 7662306a36Sopenharmony_ci} 7762306a36Sopenharmony_ci 7862306a36Sopenharmony_ci__bpf_kfunc static void dctcp_init(struct sock *sk) 7962306a36Sopenharmony_ci{ 8062306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci if ((tp->ecn_flags & TCP_ECN_OK) || 8362306a36Sopenharmony_ci (sk->sk_state == TCP_LISTEN || 8462306a36Sopenharmony_ci sk->sk_state == TCP_CLOSE)) { 8562306a36Sopenharmony_ci struct dctcp *ca = inet_csk_ca(sk); 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci ca->prior_rcv_nxt = tp->rcv_nxt; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci ca->loss_cwnd = 0; 9262306a36Sopenharmony_ci ca->ce_state = 0; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci dctcp_reset(tp, ca); 9562306a36Sopenharmony_ci tcp_plb_init(sk, &ca->plb); 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci return; 9862306a36Sopenharmony_ci } 9962306a36Sopenharmony_ci 10062306a36Sopenharmony_ci /* No ECN support? Fall back to Reno. Also need to clear 10162306a36Sopenharmony_ci * ECT from sk since it is set during 3WHS for DCTCP. 10262306a36Sopenharmony_ci */ 10362306a36Sopenharmony_ci inet_csk(sk)->icsk_ca_ops = &dctcp_reno; 10462306a36Sopenharmony_ci INET_ECN_dontxmit(sk); 10562306a36Sopenharmony_ci} 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk) 10862306a36Sopenharmony_ci{ 10962306a36Sopenharmony_ci struct dctcp *ca = inet_csk_ca(sk); 11062306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_ci ca->loss_cwnd = tcp_snd_cwnd(tp); 11362306a36Sopenharmony_ci return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); 11462306a36Sopenharmony_ci} 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags) 11762306a36Sopenharmony_ci{ 11862306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 11962306a36Sopenharmony_ci struct dctcp *ca = inet_csk_ca(sk); 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci /* Expired RTT */ 12262306a36Sopenharmony_ci if (!before(tp->snd_una, ca->next_seq)) { 12362306a36Sopenharmony_ci u32 delivered = tp->delivered - ca->old_delivered; 12462306a36Sopenharmony_ci u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; 12562306a36Sopenharmony_ci u32 alpha = ca->dctcp_alpha; 12662306a36Sopenharmony_ci u32 ce_ratio = 0; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci if (delivered > 0) { 12962306a36Sopenharmony_ci /* dctcp_alpha keeps EWMA of fraction of ECN marked 13062306a36Sopenharmony_ci * packets. Because of EWMA smoothing, PLB reaction can 13162306a36Sopenharmony_ci * be slow so we use ce_ratio which is an instantaneous 13262306a36Sopenharmony_ci * measure of congestion. ce_ratio is the fraction of 13362306a36Sopenharmony_ci * ECN marked packets in the previous RTT. 13462306a36Sopenharmony_ci */ 13562306a36Sopenharmony_ci if (delivered_ce > 0) 13662306a36Sopenharmony_ci ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; 13762306a36Sopenharmony_ci tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); 13862306a36Sopenharmony_ci tcp_plb_check_rehash(sk, &ca->plb); 13962306a36Sopenharmony_ci } 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci /* alpha = (1 - g) * alpha + g * F */ 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); 14462306a36Sopenharmony_ci if (delivered_ce) { 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci /* If dctcp_shift_g == 1, a 32bit value would overflow 14762306a36Sopenharmony_ci * after 8 M packets. 14862306a36Sopenharmony_ci */ 14962306a36Sopenharmony_ci delivered_ce <<= (10 - dctcp_shift_g); 15062306a36Sopenharmony_ci delivered_ce /= max(1U, delivered); 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); 15362306a36Sopenharmony_ci } 15462306a36Sopenharmony_ci /* dctcp_alpha can be read from dctcp_get_info() without 15562306a36Sopenharmony_ci * synchro, so we ask compiler to not use dctcp_alpha 15662306a36Sopenharmony_ci * as a temporary variable in prior operations. 15762306a36Sopenharmony_ci */ 15862306a36Sopenharmony_ci WRITE_ONCE(ca->dctcp_alpha, alpha); 15962306a36Sopenharmony_ci dctcp_reset(tp, ca); 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_cistatic void dctcp_react_to_loss(struct sock *sk) 16462306a36Sopenharmony_ci{ 16562306a36Sopenharmony_ci struct dctcp *ca = inet_csk_ca(sk); 16662306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci ca->loss_cwnd = tcp_snd_cwnd(tp); 16962306a36Sopenharmony_ci tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); 17062306a36Sopenharmony_ci} 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_ci__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) 17362306a36Sopenharmony_ci{ 17462306a36Sopenharmony_ci if (new_state == TCP_CA_Recovery && 17562306a36Sopenharmony_ci new_state != inet_csk(sk)->icsk_ca_state) 17662306a36Sopenharmony_ci dctcp_react_to_loss(sk); 17762306a36Sopenharmony_ci /* We handle RTO in dctcp_cwnd_event to ensure that we perform only 17862306a36Sopenharmony_ci * one loss-adjustment per RTT. 17962306a36Sopenharmony_ci */ 18062306a36Sopenharmony_ci} 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) 18362306a36Sopenharmony_ci{ 18462306a36Sopenharmony_ci struct dctcp *ca = inet_csk_ca(sk); 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci switch (ev) { 18762306a36Sopenharmony_ci case CA_EVENT_ECN_IS_CE: 18862306a36Sopenharmony_ci case CA_EVENT_ECN_NO_CE: 18962306a36Sopenharmony_ci dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); 19062306a36Sopenharmony_ci break; 19162306a36Sopenharmony_ci case CA_EVENT_LOSS: 19262306a36Sopenharmony_ci tcp_plb_update_state_upon_rto(sk, &ca->plb); 19362306a36Sopenharmony_ci dctcp_react_to_loss(sk); 19462306a36Sopenharmony_ci break; 19562306a36Sopenharmony_ci case CA_EVENT_TX_START: 19662306a36Sopenharmony_ci tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ 19762306a36Sopenharmony_ci break; 19862306a36Sopenharmony_ci default: 19962306a36Sopenharmony_ci /* Don't care for the rest. */ 20062306a36Sopenharmony_ci break; 20162306a36Sopenharmony_ci } 20262306a36Sopenharmony_ci} 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_cistatic size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, 20562306a36Sopenharmony_ci union tcp_cc_info *info) 20662306a36Sopenharmony_ci{ 20762306a36Sopenharmony_ci const struct dctcp *ca = inet_csk_ca(sk); 20862306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci /* Fill it also in case of VEGASINFO due to req struct limits. 21162306a36Sopenharmony_ci * We can still correctly retrieve it later. 21262306a36Sopenharmony_ci */ 21362306a36Sopenharmony_ci if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || 21462306a36Sopenharmony_ci ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 21562306a36Sopenharmony_ci memset(&info->dctcp, 0, sizeof(info->dctcp)); 21662306a36Sopenharmony_ci if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { 21762306a36Sopenharmony_ci info->dctcp.dctcp_enabled = 1; 21862306a36Sopenharmony_ci info->dctcp.dctcp_ce_state = (u16) ca->ce_state; 21962306a36Sopenharmony_ci info->dctcp.dctcp_alpha = ca->dctcp_alpha; 22062306a36Sopenharmony_ci info->dctcp.dctcp_ab_ecn = tp->mss_cache * 22162306a36Sopenharmony_ci (tp->delivered_ce - ca->old_delivered_ce); 22262306a36Sopenharmony_ci info->dctcp.dctcp_ab_tot = tp->mss_cache * 22362306a36Sopenharmony_ci (tp->delivered - ca->old_delivered); 22462306a36Sopenharmony_ci } 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci *attr = INET_DIAG_DCTCPINFO; 22762306a36Sopenharmony_ci return sizeof(info->dctcp); 22862306a36Sopenharmony_ci } 22962306a36Sopenharmony_ci return 0; 23062306a36Sopenharmony_ci} 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk) 23362306a36Sopenharmony_ci{ 23462306a36Sopenharmony_ci const struct dctcp *ca = inet_csk_ca(sk); 23562306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci return max(tcp_snd_cwnd(tp), ca->loss_cwnd); 23862306a36Sopenharmony_ci} 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_cistatic struct tcp_congestion_ops dctcp __read_mostly = { 24162306a36Sopenharmony_ci .init = dctcp_init, 24262306a36Sopenharmony_ci .in_ack_event = dctcp_update_alpha, 24362306a36Sopenharmony_ci .cwnd_event = dctcp_cwnd_event, 24462306a36Sopenharmony_ci .ssthresh = dctcp_ssthresh, 24562306a36Sopenharmony_ci .cong_avoid = tcp_reno_cong_avoid, 24662306a36Sopenharmony_ci .undo_cwnd = dctcp_cwnd_undo, 24762306a36Sopenharmony_ci .set_state = dctcp_state, 24862306a36Sopenharmony_ci .get_info = dctcp_get_info, 24962306a36Sopenharmony_ci .flags = TCP_CONG_NEEDS_ECN, 25062306a36Sopenharmony_ci .owner = THIS_MODULE, 25162306a36Sopenharmony_ci .name = "dctcp", 25262306a36Sopenharmony_ci}; 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_cistatic struct tcp_congestion_ops dctcp_reno __read_mostly = { 25562306a36Sopenharmony_ci .ssthresh = tcp_reno_ssthresh, 25662306a36Sopenharmony_ci .cong_avoid = tcp_reno_cong_avoid, 25762306a36Sopenharmony_ci .undo_cwnd = tcp_reno_undo_cwnd, 25862306a36Sopenharmony_ci .get_info = dctcp_get_info, 25962306a36Sopenharmony_ci .owner = THIS_MODULE, 26062306a36Sopenharmony_ci .name = "dctcp-reno", 26162306a36Sopenharmony_ci}; 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ciBTF_SET8_START(tcp_dctcp_check_kfunc_ids) 26462306a36Sopenharmony_ci#ifdef CONFIG_X86 26562306a36Sopenharmony_ci#ifdef CONFIG_DYNAMIC_FTRACE 26662306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_init) 26762306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_update_alpha) 26862306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_cwnd_event) 26962306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_ssthresh) 27062306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_cwnd_undo) 27162306a36Sopenharmony_ciBTF_ID_FLAGS(func, dctcp_state) 27262306a36Sopenharmony_ci#endif 27362306a36Sopenharmony_ci#endif 27462306a36Sopenharmony_ciBTF_SET8_END(tcp_dctcp_check_kfunc_ids) 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_cistatic const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { 27762306a36Sopenharmony_ci .owner = THIS_MODULE, 27862306a36Sopenharmony_ci .set = &tcp_dctcp_check_kfunc_ids, 27962306a36Sopenharmony_ci}; 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_cistatic int __init dctcp_register(void) 28262306a36Sopenharmony_ci{ 28362306a36Sopenharmony_ci int ret; 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); 28862306a36Sopenharmony_ci if (ret < 0) 28962306a36Sopenharmony_ci return ret; 29062306a36Sopenharmony_ci return tcp_register_congestion_control(&dctcp); 29162306a36Sopenharmony_ci} 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_cistatic void __exit dctcp_unregister(void) 29462306a36Sopenharmony_ci{ 29562306a36Sopenharmony_ci tcp_unregister_congestion_control(&dctcp); 29662306a36Sopenharmony_ci} 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_cimodule_init(dctcp_register); 29962306a36Sopenharmony_cimodule_exit(dctcp_unregister); 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ciMODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); 30262306a36Sopenharmony_ciMODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); 30362306a36Sopenharmony_ciMODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ciMODULE_LICENSE("GPL v2"); 30662306a36Sopenharmony_ciMODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); 307