162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * TCP CUBIC: Binary Increase Congestion control for TCP v2.3 462306a36Sopenharmony_ci * Home page: 562306a36Sopenharmony_ci * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC 662306a36Sopenharmony_ci * This is from the implementation of CUBIC TCP in 762306a36Sopenharmony_ci * Sangtae Ha, Injong Rhee and Lisong Xu, 862306a36Sopenharmony_ci * "CUBIC: A New TCP-Friendly High-Speed TCP Variant" 962306a36Sopenharmony_ci * in ACM SIGOPS Operating System Review, July 2008. 1062306a36Sopenharmony_ci * Available from: 1162306a36Sopenharmony_ci * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf 1262306a36Sopenharmony_ci * 1362306a36Sopenharmony_ci * CUBIC integrates a new slow start algorithm, called HyStart. 1462306a36Sopenharmony_ci * The details of HyStart are presented in 1562306a36Sopenharmony_ci * Sangtae Ha and Injong Rhee, 1662306a36Sopenharmony_ci * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008. 1762306a36Sopenharmony_ci * Available from: 1862306a36Sopenharmony_ci * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf 1962306a36Sopenharmony_ci * 2062306a36Sopenharmony_ci * All testing results are available from: 2162306a36Sopenharmony_ci * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * Unless CUBIC is enabled and congestion window is large 2462306a36Sopenharmony_ci * this behaves the same as the original Reno. 2562306a36Sopenharmony_ci */ 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci#include <linux/mm.h> 2862306a36Sopenharmony_ci#include <linux/btf.h> 2962306a36Sopenharmony_ci#include <linux/btf_ids.h> 3062306a36Sopenharmony_ci#include <linux/module.h> 3162306a36Sopenharmony_ci#include <linux/math64.h> 3262306a36Sopenharmony_ci#include <net/tcp.h> 3362306a36Sopenharmony_ci 3462306a36Sopenharmony_ci#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation 3562306a36Sopenharmony_ci * max_cwnd = snd_cwnd * beta 3662306a36Sopenharmony_ci */ 3762306a36Sopenharmony_ci#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci/* Two methods of hybrid slow start */ 4062306a36Sopenharmony_ci#define HYSTART_ACK_TRAIN 0x1 4162306a36Sopenharmony_ci#define HYSTART_DELAY 0x2 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci/* Number of delay samples for detecting the increase of delay */ 4462306a36Sopenharmony_ci#define HYSTART_MIN_SAMPLES 8 4562306a36Sopenharmony_ci#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ 4662306a36Sopenharmony_ci#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ 4762306a36Sopenharmony_ci#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistatic int fast_convergence __read_mostly = 1; 5062306a36Sopenharmony_cistatic int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ 5162306a36Sopenharmony_cistatic int initial_ssthresh __read_mostly; 5262306a36Sopenharmony_cistatic int bic_scale __read_mostly = 41; 5362306a36Sopenharmony_cistatic int tcp_friendliness __read_mostly = 1; 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_cistatic int hystart __read_mostly = 1; 5662306a36Sopenharmony_cistatic int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 5762306a36Sopenharmony_cistatic int hystart_low_window __read_mostly = 16; 5862306a36Sopenharmony_cistatic int hystart_ack_delta_us __read_mostly = 2000; 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cistatic u32 cube_rtt_scale __read_mostly; 6162306a36Sopenharmony_cistatic u32 beta_scale __read_mostly; 6262306a36Sopenharmony_cistatic u64 cube_factor __read_mostly; 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_ci/* Note parameters that are used for precomputing scale factors are read-only */ 6562306a36Sopenharmony_cimodule_param(fast_convergence, int, 0644); 6662306a36Sopenharmony_ciMODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); 6762306a36Sopenharmony_cimodule_param(beta, int, 0644); 6862306a36Sopenharmony_ciMODULE_PARM_DESC(beta, "beta for multiplicative increase"); 6962306a36Sopenharmony_cimodule_param(initial_ssthresh, int, 0644); 7062306a36Sopenharmony_ciMODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); 7162306a36Sopenharmony_cimodule_param(bic_scale, int, 0444); 7262306a36Sopenharmony_ciMODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); 7362306a36Sopenharmony_cimodule_param(tcp_friendliness, int, 0644); 7462306a36Sopenharmony_ciMODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); 7562306a36Sopenharmony_cimodule_param(hystart, int, 0644); 7662306a36Sopenharmony_ciMODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); 7762306a36Sopenharmony_cimodule_param(hystart_detect, int, 0644); 7862306a36Sopenharmony_ciMODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" 7962306a36Sopenharmony_ci " 1: packet-train 2: delay 3: both packet-train and delay"); 8062306a36Sopenharmony_cimodule_param(hystart_low_window, int, 0644); 8162306a36Sopenharmony_ciMODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 8262306a36Sopenharmony_cimodule_param(hystart_ack_delta_us, int, 0644); 8362306a36Sopenharmony_ciMODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci/* BIC TCP Parameters */ 8662306a36Sopenharmony_cistruct bictcp { 8762306a36Sopenharmony_ci u32 cnt; /* increase cwnd by 1 after ACKs */ 8862306a36Sopenharmony_ci u32 last_max_cwnd; /* last maximum snd_cwnd */ 8962306a36Sopenharmony_ci u32 last_cwnd; /* the last snd_cwnd */ 9062306a36Sopenharmony_ci u32 last_time; /* time when updated last_cwnd */ 9162306a36Sopenharmony_ci u32 bic_origin_point;/* origin point of bic function */ 9262306a36Sopenharmony_ci u32 bic_K; /* time to origin point 9362306a36Sopenharmony_ci from the beginning of the current epoch */ 9462306a36Sopenharmony_ci u32 delay_min; /* min delay (usec) */ 9562306a36Sopenharmony_ci u32 epoch_start; /* beginning of an epoch */ 9662306a36Sopenharmony_ci u32 ack_cnt; /* number of acks */ 9762306a36Sopenharmony_ci u32 tcp_cwnd; /* estimated tcp cwnd */ 9862306a36Sopenharmony_ci u16 unused; 9962306a36Sopenharmony_ci u8 sample_cnt; /* number of samples to decide curr_rtt */ 10062306a36Sopenharmony_ci u8 found; /* the exit point is found? */ 10162306a36Sopenharmony_ci u32 round_start; /* beginning of each round */ 10262306a36Sopenharmony_ci u32 end_seq; /* end_seq of the round */ 10362306a36Sopenharmony_ci u32 last_ack; /* last time when the ACK spacing is close */ 10462306a36Sopenharmony_ci u32 curr_rtt; /* the minimum rtt of current round */ 10562306a36Sopenharmony_ci}; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_cistatic inline void bictcp_reset(struct bictcp *ca) 10862306a36Sopenharmony_ci{ 10962306a36Sopenharmony_ci memset(ca, 0, offsetof(struct bictcp, unused)); 11062306a36Sopenharmony_ci ca->found = 0; 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_cistatic inline u32 bictcp_clock_us(const struct sock *sk) 11462306a36Sopenharmony_ci{ 11562306a36Sopenharmony_ci return tcp_sk(sk)->tcp_mstamp; 11662306a36Sopenharmony_ci} 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_cistatic inline void bictcp_hystart_reset(struct sock *sk) 11962306a36Sopenharmony_ci{ 12062306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 12162306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci ca->round_start = ca->last_ack = bictcp_clock_us(sk); 12462306a36Sopenharmony_ci ca->end_seq = tp->snd_nxt; 12562306a36Sopenharmony_ci ca->curr_rtt = ~0U; 12662306a36Sopenharmony_ci ca->sample_cnt = 0; 12762306a36Sopenharmony_ci} 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci__bpf_kfunc static void cubictcp_init(struct sock *sk) 13062306a36Sopenharmony_ci{ 13162306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci bictcp_reset(ca); 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci if (hystart) 13662306a36Sopenharmony_ci bictcp_hystart_reset(sk); 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci if (!hystart && initial_ssthresh) 13962306a36Sopenharmony_ci tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 14062306a36Sopenharmony_ci} 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) 14362306a36Sopenharmony_ci{ 14462306a36Sopenharmony_ci if (event == CA_EVENT_TX_START) { 14562306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 14662306a36Sopenharmony_ci u32 now = tcp_jiffies32; 14762306a36Sopenharmony_ci s32 delta; 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci delta = now - tcp_sk(sk)->lsndtime; 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci /* We were application limited (idle) for a while. 15262306a36Sopenharmony_ci * Shift epoch_start to keep cwnd growth to cubic curve. 15362306a36Sopenharmony_ci */ 15462306a36Sopenharmony_ci if (ca->epoch_start && delta > 0) { 15562306a36Sopenharmony_ci ca->epoch_start += delta; 15662306a36Sopenharmony_ci if (after(ca->epoch_start, now)) 15762306a36Sopenharmony_ci ca->epoch_start = now; 15862306a36Sopenharmony_ci } 15962306a36Sopenharmony_ci return; 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci/* calculate the cubic root of x using a table lookup followed by one 16462306a36Sopenharmony_ci * Newton-Raphson iteration. 16562306a36Sopenharmony_ci * Avg err ~= 0.195% 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_cistatic u32 cubic_root(u64 a) 16862306a36Sopenharmony_ci{ 16962306a36Sopenharmony_ci u32 x, b, shift; 17062306a36Sopenharmony_ci /* 17162306a36Sopenharmony_ci * cbrt(x) MSB values for x MSB values in [0..63]. 17262306a36Sopenharmony_ci * Precomputed then refined by hand - Willy Tarreau 17362306a36Sopenharmony_ci * 17462306a36Sopenharmony_ci * For x in [0..63], 17562306a36Sopenharmony_ci * v = cbrt(x << 18) - 1 17662306a36Sopenharmony_ci * cbrt(x) = (v[x] + 10) >> 6 17762306a36Sopenharmony_ci */ 17862306a36Sopenharmony_ci static const u8 v[] = { 17962306a36Sopenharmony_ci /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, 18062306a36Sopenharmony_ci /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, 18162306a36Sopenharmony_ci /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, 18262306a36Sopenharmony_ci /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, 18362306a36Sopenharmony_ci /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, 18462306a36Sopenharmony_ci /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, 18562306a36Sopenharmony_ci /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, 18662306a36Sopenharmony_ci /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, 18762306a36Sopenharmony_ci }; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci b = fls64(a); 19062306a36Sopenharmony_ci if (b < 7) { 19162306a36Sopenharmony_ci /* a in [0..63] */ 19262306a36Sopenharmony_ci return ((u32)v[(u32)a] + 35) >> 6; 19362306a36Sopenharmony_ci } 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci b = ((b * 84) >> 8) - 1; 19662306a36Sopenharmony_ci shift = (a >> (b * 3)); 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci /* 20162306a36Sopenharmony_ci * Newton-Raphson iteration 20262306a36Sopenharmony_ci * 2 20362306a36Sopenharmony_ci * x = ( 2 * x + a / x ) / 3 20462306a36Sopenharmony_ci * k+1 k k 20562306a36Sopenharmony_ci */ 20662306a36Sopenharmony_ci x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1))); 20762306a36Sopenharmony_ci x = ((x * 341) >> 10); 20862306a36Sopenharmony_ci return x; 20962306a36Sopenharmony_ci} 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci/* 21262306a36Sopenharmony_ci * Compute congestion window to use. 21362306a36Sopenharmony_ci */ 21462306a36Sopenharmony_cistatic inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) 21562306a36Sopenharmony_ci{ 21662306a36Sopenharmony_ci u32 delta, bic_target, max_cnt; 21762306a36Sopenharmony_ci u64 offs, t; 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci ca->ack_cnt += acked; /* count the number of ACKed packets */ 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci if (ca->last_cwnd == cwnd && 22262306a36Sopenharmony_ci (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) 22362306a36Sopenharmony_ci return; 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci /* The CUBIC function can update ca->cnt at most once per jiffy. 22662306a36Sopenharmony_ci * On all cwnd reduction events, ca->epoch_start is set to 0, 22762306a36Sopenharmony_ci * which will force a recalculation of ca->cnt. 22862306a36Sopenharmony_ci */ 22962306a36Sopenharmony_ci if (ca->epoch_start && tcp_jiffies32 == ca->last_time) 23062306a36Sopenharmony_ci goto tcp_friendliness; 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci ca->last_cwnd = cwnd; 23362306a36Sopenharmony_ci ca->last_time = tcp_jiffies32; 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci if (ca->epoch_start == 0) { 23662306a36Sopenharmony_ci ca->epoch_start = tcp_jiffies32; /* record beginning */ 23762306a36Sopenharmony_ci ca->ack_cnt = acked; /* start counting */ 23862306a36Sopenharmony_ci ca->tcp_cwnd = cwnd; /* syn with cubic */ 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci if (ca->last_max_cwnd <= cwnd) { 24162306a36Sopenharmony_ci ca->bic_K = 0; 24262306a36Sopenharmony_ci ca->bic_origin_point = cwnd; 24362306a36Sopenharmony_ci } else { 24462306a36Sopenharmony_ci /* Compute new K based on 24562306a36Sopenharmony_ci * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) 24662306a36Sopenharmony_ci */ 24762306a36Sopenharmony_ci ca->bic_K = cubic_root(cube_factor 24862306a36Sopenharmony_ci * (ca->last_max_cwnd - cwnd)); 24962306a36Sopenharmony_ci ca->bic_origin_point = ca->last_max_cwnd; 25062306a36Sopenharmony_ci } 25162306a36Sopenharmony_ci } 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci /* cubic function - calc*/ 25462306a36Sopenharmony_ci /* calculate c * time^3 / rtt, 25562306a36Sopenharmony_ci * while considering overflow in calculation of time^3 25662306a36Sopenharmony_ci * (so time^3 is done by using 64 bit) 25762306a36Sopenharmony_ci * and without the support of division of 64bit numbers 25862306a36Sopenharmony_ci * (so all divisions are done by using 32 bit) 25962306a36Sopenharmony_ci * also NOTE the unit of those veriables 26062306a36Sopenharmony_ci * time = (t - K) / 2^bictcp_HZ 26162306a36Sopenharmony_ci * c = bic_scale >> 10 26262306a36Sopenharmony_ci * rtt = (srtt >> 3) / HZ 26362306a36Sopenharmony_ci * !!! The following code does not have overflow problems, 26462306a36Sopenharmony_ci * if the cwnd < 1 million packets !!! 26562306a36Sopenharmony_ci */ 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci t = (s32)(tcp_jiffies32 - ca->epoch_start); 26862306a36Sopenharmony_ci t += usecs_to_jiffies(ca->delay_min); 26962306a36Sopenharmony_ci /* change the unit from HZ to bictcp_HZ */ 27062306a36Sopenharmony_ci t <<= BICTCP_HZ; 27162306a36Sopenharmony_ci do_div(t, HZ); 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci if (t < ca->bic_K) /* t - K */ 27462306a36Sopenharmony_ci offs = ca->bic_K - t; 27562306a36Sopenharmony_ci else 27662306a36Sopenharmony_ci offs = t - ca->bic_K; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci /* c/rtt * (t-K)^3 */ 27962306a36Sopenharmony_ci delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); 28062306a36Sopenharmony_ci if (t < ca->bic_K) /* below origin*/ 28162306a36Sopenharmony_ci bic_target = ca->bic_origin_point - delta; 28262306a36Sopenharmony_ci else /* above origin*/ 28362306a36Sopenharmony_ci bic_target = ca->bic_origin_point + delta; 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci /* cubic function - calc bictcp_cnt*/ 28662306a36Sopenharmony_ci if (bic_target > cwnd) { 28762306a36Sopenharmony_ci ca->cnt = cwnd / (bic_target - cwnd); 28862306a36Sopenharmony_ci } else { 28962306a36Sopenharmony_ci ca->cnt = 100 * cwnd; /* very small increment*/ 29062306a36Sopenharmony_ci } 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci /* 29362306a36Sopenharmony_ci * The initial growth of cubic function may be too conservative 29462306a36Sopenharmony_ci * when the available bandwidth is still unknown. 29562306a36Sopenharmony_ci */ 29662306a36Sopenharmony_ci if (ca->last_max_cwnd == 0 && ca->cnt > 20) 29762306a36Sopenharmony_ci ca->cnt = 20; /* increase cwnd 5% per RTT */ 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_citcp_friendliness: 30062306a36Sopenharmony_ci /* TCP Friendly */ 30162306a36Sopenharmony_ci if (tcp_friendliness) { 30262306a36Sopenharmony_ci u32 scale = beta_scale; 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci delta = (cwnd * scale) >> 3; 30562306a36Sopenharmony_ci while (ca->ack_cnt > delta) { /* update tcp cwnd */ 30662306a36Sopenharmony_ci ca->ack_cnt -= delta; 30762306a36Sopenharmony_ci ca->tcp_cwnd++; 30862306a36Sopenharmony_ci } 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ 31162306a36Sopenharmony_ci delta = ca->tcp_cwnd - cwnd; 31262306a36Sopenharmony_ci max_cnt = cwnd / delta; 31362306a36Sopenharmony_ci if (ca->cnt > max_cnt) 31462306a36Sopenharmony_ci ca->cnt = max_cnt; 31562306a36Sopenharmony_ci } 31662306a36Sopenharmony_ci } 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci /* The maximum rate of cwnd increase CUBIC allows is 1 packet per 31962306a36Sopenharmony_ci * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. 32062306a36Sopenharmony_ci */ 32162306a36Sopenharmony_ci ca->cnt = max(ca->cnt, 2U); 32262306a36Sopenharmony_ci} 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) 32562306a36Sopenharmony_ci{ 32662306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 32762306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_ci if (!tcp_is_cwnd_limited(sk)) 33062306a36Sopenharmony_ci return; 33162306a36Sopenharmony_ci 33262306a36Sopenharmony_ci if (tcp_in_slow_start(tp)) { 33362306a36Sopenharmony_ci acked = tcp_slow_start(tp, acked); 33462306a36Sopenharmony_ci if (!acked) 33562306a36Sopenharmony_ci return; 33662306a36Sopenharmony_ci } 33762306a36Sopenharmony_ci bictcp_update(ca, tcp_snd_cwnd(tp), acked); 33862306a36Sopenharmony_ci tcp_cong_avoid_ai(tp, ca->cnt, acked); 33962306a36Sopenharmony_ci} 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk) 34262306a36Sopenharmony_ci{ 34362306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 34462306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 34562306a36Sopenharmony_ci 34662306a36Sopenharmony_ci ca->epoch_start = 0; /* end of epoch */ 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ci /* Wmax and fast convergence */ 34962306a36Sopenharmony_ci if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) 35062306a36Sopenharmony_ci ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) 35162306a36Sopenharmony_ci / (2 * BICTCP_BETA_SCALE); 35262306a36Sopenharmony_ci else 35362306a36Sopenharmony_ci ca->last_max_cwnd = tcp_snd_cwnd(tp); 35462306a36Sopenharmony_ci 35562306a36Sopenharmony_ci return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); 35662306a36Sopenharmony_ci} 35762306a36Sopenharmony_ci 35862306a36Sopenharmony_ci__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state) 35962306a36Sopenharmony_ci{ 36062306a36Sopenharmony_ci if (new_state == TCP_CA_Loss) { 36162306a36Sopenharmony_ci bictcp_reset(inet_csk_ca(sk)); 36262306a36Sopenharmony_ci bictcp_hystart_reset(sk); 36362306a36Sopenharmony_ci } 36462306a36Sopenharmony_ci} 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci/* Account for TSO/GRO delays. 36762306a36Sopenharmony_ci * Otherwise short RTT flows could get too small ssthresh, since during 36862306a36Sopenharmony_ci * slow start we begin with small TSO packets and ca->delay_min would 36962306a36Sopenharmony_ci * not account for long aggregation delay when TSO packets get bigger. 37062306a36Sopenharmony_ci * Ideally even with a very small RTT we would like to have at least one 37162306a36Sopenharmony_ci * TSO packet being sent and received by GRO, and another one in qdisc layer. 37262306a36Sopenharmony_ci * We apply another 100% factor because @rate is doubled at this point. 37362306a36Sopenharmony_ci * We cap the cushion to 1ms. 37462306a36Sopenharmony_ci */ 37562306a36Sopenharmony_cistatic u32 hystart_ack_delay(const struct sock *sk) 37662306a36Sopenharmony_ci{ 37762306a36Sopenharmony_ci unsigned long rate; 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_ci rate = READ_ONCE(sk->sk_pacing_rate); 38062306a36Sopenharmony_ci if (!rate) 38162306a36Sopenharmony_ci return 0; 38262306a36Sopenharmony_ci return min_t(u64, USEC_PER_MSEC, 38362306a36Sopenharmony_ci div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate)); 38462306a36Sopenharmony_ci} 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_cistatic void hystart_update(struct sock *sk, u32 delay) 38762306a36Sopenharmony_ci{ 38862306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 38962306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 39062306a36Sopenharmony_ci u32 threshold; 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci if (after(tp->snd_una, ca->end_seq)) 39362306a36Sopenharmony_ci bictcp_hystart_reset(sk); 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_ci if (hystart_detect & HYSTART_ACK_TRAIN) { 39662306a36Sopenharmony_ci u32 now = bictcp_clock_us(sk); 39762306a36Sopenharmony_ci 39862306a36Sopenharmony_ci /* first detection parameter - ack-train detection */ 39962306a36Sopenharmony_ci if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { 40062306a36Sopenharmony_ci ca->last_ack = now; 40162306a36Sopenharmony_ci 40262306a36Sopenharmony_ci threshold = ca->delay_min + hystart_ack_delay(sk); 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_ci /* Hystart ack train triggers if we get ack past 40562306a36Sopenharmony_ci * ca->delay_min/2. 40662306a36Sopenharmony_ci * Pacing might have delayed packets up to RTT/2 40762306a36Sopenharmony_ci * during slow start. 40862306a36Sopenharmony_ci */ 40962306a36Sopenharmony_ci if (sk->sk_pacing_status == SK_PACING_NONE) 41062306a36Sopenharmony_ci threshold >>= 1; 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_ci if ((s32)(now - ca->round_start) > threshold) { 41362306a36Sopenharmony_ci ca->found = 1; 41462306a36Sopenharmony_ci pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", 41562306a36Sopenharmony_ci now - ca->round_start, threshold, 41662306a36Sopenharmony_ci ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp)); 41762306a36Sopenharmony_ci NET_INC_STATS(sock_net(sk), 41862306a36Sopenharmony_ci LINUX_MIB_TCPHYSTARTTRAINDETECT); 41962306a36Sopenharmony_ci NET_ADD_STATS(sock_net(sk), 42062306a36Sopenharmony_ci LINUX_MIB_TCPHYSTARTTRAINCWND, 42162306a36Sopenharmony_ci tcp_snd_cwnd(tp)); 42262306a36Sopenharmony_ci tp->snd_ssthresh = tcp_snd_cwnd(tp); 42362306a36Sopenharmony_ci } 42462306a36Sopenharmony_ci } 42562306a36Sopenharmony_ci } 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci if (hystart_detect & HYSTART_DELAY) { 42862306a36Sopenharmony_ci /* obtain the minimum delay of more than sampling packets */ 42962306a36Sopenharmony_ci if (ca->curr_rtt > delay) 43062306a36Sopenharmony_ci ca->curr_rtt = delay; 43162306a36Sopenharmony_ci if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { 43262306a36Sopenharmony_ci ca->sample_cnt++; 43362306a36Sopenharmony_ci } else { 43462306a36Sopenharmony_ci if (ca->curr_rtt > ca->delay_min + 43562306a36Sopenharmony_ci HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { 43662306a36Sopenharmony_ci ca->found = 1; 43762306a36Sopenharmony_ci NET_INC_STATS(sock_net(sk), 43862306a36Sopenharmony_ci LINUX_MIB_TCPHYSTARTDELAYDETECT); 43962306a36Sopenharmony_ci NET_ADD_STATS(sock_net(sk), 44062306a36Sopenharmony_ci LINUX_MIB_TCPHYSTARTDELAYCWND, 44162306a36Sopenharmony_ci tcp_snd_cwnd(tp)); 44262306a36Sopenharmony_ci tp->snd_ssthresh = tcp_snd_cwnd(tp); 44362306a36Sopenharmony_ci } 44462306a36Sopenharmony_ci } 44562306a36Sopenharmony_ci } 44662306a36Sopenharmony_ci} 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) 44962306a36Sopenharmony_ci{ 45062306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 45162306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 45262306a36Sopenharmony_ci u32 delay; 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci /* Some calls are for duplicates without timetamps */ 45562306a36Sopenharmony_ci if (sample->rtt_us < 0) 45662306a36Sopenharmony_ci return; 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci /* Discard delay samples right after fast recovery */ 45962306a36Sopenharmony_ci if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) 46062306a36Sopenharmony_ci return; 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci delay = sample->rtt_us; 46362306a36Sopenharmony_ci if (delay == 0) 46462306a36Sopenharmony_ci delay = 1; 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci /* first time call or link delay decreases */ 46762306a36Sopenharmony_ci if (ca->delay_min == 0 || ca->delay_min > delay) 46862306a36Sopenharmony_ci ca->delay_min = delay; 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci /* hystart triggers when cwnd is larger than some threshold */ 47162306a36Sopenharmony_ci if (!ca->found && tcp_in_slow_start(tp) && hystart && 47262306a36Sopenharmony_ci tcp_snd_cwnd(tp) >= hystart_low_window) 47362306a36Sopenharmony_ci hystart_update(sk, delay); 47462306a36Sopenharmony_ci} 47562306a36Sopenharmony_ci 47662306a36Sopenharmony_cistatic struct tcp_congestion_ops cubictcp __read_mostly = { 47762306a36Sopenharmony_ci .init = cubictcp_init, 47862306a36Sopenharmony_ci .ssthresh = cubictcp_recalc_ssthresh, 47962306a36Sopenharmony_ci .cong_avoid = cubictcp_cong_avoid, 48062306a36Sopenharmony_ci .set_state = cubictcp_state, 48162306a36Sopenharmony_ci .undo_cwnd = tcp_reno_undo_cwnd, 48262306a36Sopenharmony_ci .cwnd_event = cubictcp_cwnd_event, 48362306a36Sopenharmony_ci .pkts_acked = cubictcp_acked, 48462306a36Sopenharmony_ci .owner = THIS_MODULE, 48562306a36Sopenharmony_ci .name = "cubic", 48662306a36Sopenharmony_ci}; 48762306a36Sopenharmony_ci 48862306a36Sopenharmony_ciBTF_SET8_START(tcp_cubic_check_kfunc_ids) 48962306a36Sopenharmony_ci#ifdef CONFIG_X86 49062306a36Sopenharmony_ci#ifdef CONFIG_DYNAMIC_FTRACE 49162306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_init) 49262306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) 49362306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_cong_avoid) 49462306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_state) 49562306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_cwnd_event) 49662306a36Sopenharmony_ciBTF_ID_FLAGS(func, cubictcp_acked) 49762306a36Sopenharmony_ci#endif 49862306a36Sopenharmony_ci#endif 49962306a36Sopenharmony_ciBTF_SET8_END(tcp_cubic_check_kfunc_ids) 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_cistatic const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { 50262306a36Sopenharmony_ci .owner = THIS_MODULE, 50362306a36Sopenharmony_ci .set = &tcp_cubic_check_kfunc_ids, 50462306a36Sopenharmony_ci}; 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_cistatic int __init cubictcp_register(void) 50762306a36Sopenharmony_ci{ 50862306a36Sopenharmony_ci int ret; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_ci /* Precompute a bunch of the scaling factors that are used per-packet 51362306a36Sopenharmony_ci * based on SRTT of 100ms 51462306a36Sopenharmony_ci */ 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_ci beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 51762306a36Sopenharmony_ci / (BICTCP_BETA_SCALE - beta); 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ 52062306a36Sopenharmony_ci 52162306a36Sopenharmony_ci /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 52262306a36Sopenharmony_ci * so K = cubic_root( (wmax-cwnd)*rtt/c ) 52362306a36Sopenharmony_ci * the unit of K is bictcp_HZ=2^10, not HZ 52462306a36Sopenharmony_ci * 52562306a36Sopenharmony_ci * c = bic_scale >> 10 52662306a36Sopenharmony_ci * rtt = 100ms 52762306a36Sopenharmony_ci * 52862306a36Sopenharmony_ci * the following code has been designed and tested for 52962306a36Sopenharmony_ci * cwnd < 1 million packets 53062306a36Sopenharmony_ci * RTT < 100 seconds 53162306a36Sopenharmony_ci * HZ < 1,000,00 (corresponding to 10 nano-second) 53262306a36Sopenharmony_ci */ 53362306a36Sopenharmony_ci 53462306a36Sopenharmony_ci /* 1/c * 2^2*bictcp_HZ * srtt */ 53562306a36Sopenharmony_ci cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci /* divide by bic_scale and by constant Srtt (100ms) */ 53862306a36Sopenharmony_ci do_div(cube_factor, bic_scale * 10); 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set); 54162306a36Sopenharmony_ci if (ret < 0) 54262306a36Sopenharmony_ci return ret; 54362306a36Sopenharmony_ci return tcp_register_congestion_control(&cubictcp); 54462306a36Sopenharmony_ci} 54562306a36Sopenharmony_ci 54662306a36Sopenharmony_cistatic void __exit cubictcp_unregister(void) 54762306a36Sopenharmony_ci{ 54862306a36Sopenharmony_ci tcp_unregister_congestion_control(&cubictcp); 54962306a36Sopenharmony_ci} 55062306a36Sopenharmony_ci 55162306a36Sopenharmony_cimodule_init(cubictcp_register); 55262306a36Sopenharmony_cimodule_exit(cubictcp_unregister); 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ciMODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); 55562306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 55662306a36Sopenharmony_ciMODULE_DESCRIPTION("CUBIC TCP"); 55762306a36Sopenharmony_ciMODULE_VERSION("2.3"); 558