162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci 362306a36Sopenharmony_ci/* WARNING: This implemenation is not necessarily the same 462306a36Sopenharmony_ci * as the tcp_cubic.c. The purpose is mainly for testing 562306a36Sopenharmony_ci * the kernel BPF logic. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Highlights: 862306a36Sopenharmony_ci * 1. CONFIG_HZ .kconfig map is used. 962306a36Sopenharmony_ci * 2. In bictcp_update(), calculation is changed to use usec 1062306a36Sopenharmony_ci * resolution (i.e. USEC_PER_JIFFY) instead of using jiffies. 1162306a36Sopenharmony_ci * Thus, usecs_to_jiffies() is not used in the bpf_cubic.c. 1262306a36Sopenharmony_ci * 3. In bitctcp_update() [under tcp_friendliness], the original 1362306a36Sopenharmony_ci * "while (ca->ack_cnt > delta)" loop is changed to the equivalent 1462306a36Sopenharmony_ci * "ca->ack_cnt / delta" operation. 1562306a36Sopenharmony_ci */ 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ci#include <linux/bpf.h> 1862306a36Sopenharmony_ci#include <linux/stddef.h> 1962306a36Sopenharmony_ci#include <linux/tcp.h> 2062306a36Sopenharmony_ci#include "bpf_tcp_helpers.h" 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_cichar _license[] SEC("license") = "GPL"; 2362306a36Sopenharmony_ci 2462306a36Sopenharmony_ci#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation 2762306a36Sopenharmony_ci * max_cwnd = snd_cwnd * beta 2862306a36Sopenharmony_ci */ 2962306a36Sopenharmony_ci#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci/* Two methods of hybrid slow start */ 3262306a36Sopenharmony_ci#define HYSTART_ACK_TRAIN 0x1 3362306a36Sopenharmony_ci#define HYSTART_DELAY 0x2 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci/* Number of delay samples for detecting the increase of delay */ 3662306a36Sopenharmony_ci#define HYSTART_MIN_SAMPLES 8 3762306a36Sopenharmony_ci#define HYSTART_DELAY_MIN (4000U) /* 4ms */ 3862306a36Sopenharmony_ci#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ 3962306a36Sopenharmony_ci#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistatic int fast_convergence = 1; 4262306a36Sopenharmony_cistatic const int beta = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ 4362306a36Sopenharmony_cistatic int initial_ssthresh; 4462306a36Sopenharmony_cistatic const int bic_scale = 41; 4562306a36Sopenharmony_cistatic int tcp_friendliness = 1; 4662306a36Sopenharmony_ci 4762306a36Sopenharmony_cistatic int hystart = 1; 4862306a36Sopenharmony_cistatic int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY; 4962306a36Sopenharmony_cistatic int hystart_low_window = 16; 5062306a36Sopenharmony_cistatic int hystart_ack_delta_us = 2000; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_cistatic const __u32 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ 5362306a36Sopenharmony_cistatic const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 5462306a36Sopenharmony_ci / (BICTCP_BETA_SCALE - beta); 5562306a36Sopenharmony_ci/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 5662306a36Sopenharmony_ci * so K = cubic_root( (wmax-cwnd)*rtt/c ) 5762306a36Sopenharmony_ci * the unit of K is bictcp_HZ=2^10, not HZ 5862306a36Sopenharmony_ci * 5962306a36Sopenharmony_ci * c = bic_scale >> 10 6062306a36Sopenharmony_ci * rtt = 100ms 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * the following code has been designed and tested for 6362306a36Sopenharmony_ci * cwnd < 1 million packets 6462306a36Sopenharmony_ci * RTT < 100 seconds 6562306a36Sopenharmony_ci * HZ < 1,000,00 (corresponding to 10 nano-second) 6662306a36Sopenharmony_ci */ 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci/* 1/c * 2^2*bictcp_HZ * srtt, 2^40 */ 6962306a36Sopenharmony_cistatic const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) 7062306a36Sopenharmony_ci / (bic_scale * 10); 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_ci/* BIC TCP Parameters */ 7362306a36Sopenharmony_cistruct bictcp { 7462306a36Sopenharmony_ci __u32 cnt; /* increase cwnd by 1 after ACKs */ 7562306a36Sopenharmony_ci __u32 last_max_cwnd; /* last maximum snd_cwnd */ 7662306a36Sopenharmony_ci __u32 last_cwnd; /* the last snd_cwnd */ 7762306a36Sopenharmony_ci __u32 last_time; /* time when updated last_cwnd */ 7862306a36Sopenharmony_ci __u32 bic_origin_point;/* origin point of bic function */ 7962306a36Sopenharmony_ci __u32 bic_K; /* time to origin point 8062306a36Sopenharmony_ci from the beginning of the current epoch */ 8162306a36Sopenharmony_ci __u32 delay_min; /* min delay (usec) */ 8262306a36Sopenharmony_ci __u32 epoch_start; /* beginning of an epoch */ 8362306a36Sopenharmony_ci __u32 ack_cnt; /* number of acks */ 8462306a36Sopenharmony_ci __u32 tcp_cwnd; /* estimated tcp cwnd */ 8562306a36Sopenharmony_ci __u16 unused; 8662306a36Sopenharmony_ci __u8 sample_cnt; /* number of samples to decide curr_rtt */ 8762306a36Sopenharmony_ci __u8 found; /* the exit point is found? */ 8862306a36Sopenharmony_ci __u32 round_start; /* beginning of each round */ 8962306a36Sopenharmony_ci __u32 end_seq; /* end_seq of the round */ 9062306a36Sopenharmony_ci __u32 last_ack; /* last time when the ACK spacing is close */ 9162306a36Sopenharmony_ci __u32 curr_rtt; /* the minimum rtt of current round */ 9262306a36Sopenharmony_ci}; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cistatic inline void bictcp_reset(struct bictcp *ca) 9562306a36Sopenharmony_ci{ 9662306a36Sopenharmony_ci ca->cnt = 0; 9762306a36Sopenharmony_ci ca->last_max_cwnd = 0; 9862306a36Sopenharmony_ci ca->last_cwnd = 0; 9962306a36Sopenharmony_ci ca->last_time = 0; 10062306a36Sopenharmony_ci ca->bic_origin_point = 0; 10162306a36Sopenharmony_ci ca->bic_K = 0; 10262306a36Sopenharmony_ci ca->delay_min = 0; 10362306a36Sopenharmony_ci ca->epoch_start = 0; 10462306a36Sopenharmony_ci ca->ack_cnt = 0; 10562306a36Sopenharmony_ci ca->tcp_cwnd = 0; 10662306a36Sopenharmony_ci ca->found = 0; 10762306a36Sopenharmony_ci} 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ciextern unsigned long CONFIG_HZ __kconfig; 11062306a36Sopenharmony_ci#define HZ CONFIG_HZ 11162306a36Sopenharmony_ci#define USEC_PER_MSEC 1000UL 11262306a36Sopenharmony_ci#define USEC_PER_SEC 1000000UL 11362306a36Sopenharmony_ci#define USEC_PER_JIFFY (USEC_PER_SEC / HZ) 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_cistatic __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) 11662306a36Sopenharmony_ci{ 11762306a36Sopenharmony_ci return dividend / divisor; 11862306a36Sopenharmony_ci} 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci#define div64_ul div64_u64 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci#define BITS_PER_U64 (sizeof(__u64) * 8) 12362306a36Sopenharmony_cistatic __always_inline int fls64(__u64 x) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci int num = BITS_PER_U64 - 1; 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci if (x == 0) 12862306a36Sopenharmony_ci return 0; 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-32)))) { 13162306a36Sopenharmony_ci num -= 32; 13262306a36Sopenharmony_ci x <<= 32; 13362306a36Sopenharmony_ci } 13462306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-16)))) { 13562306a36Sopenharmony_ci num -= 16; 13662306a36Sopenharmony_ci x <<= 16; 13762306a36Sopenharmony_ci } 13862306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-8)))) { 13962306a36Sopenharmony_ci num -= 8; 14062306a36Sopenharmony_ci x <<= 8; 14162306a36Sopenharmony_ci } 14262306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-4)))) { 14362306a36Sopenharmony_ci num -= 4; 14462306a36Sopenharmony_ci x <<= 4; 14562306a36Sopenharmony_ci } 14662306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-2)))) { 14762306a36Sopenharmony_ci num -= 2; 14862306a36Sopenharmony_ci x <<= 2; 14962306a36Sopenharmony_ci } 15062306a36Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-1)))) 15162306a36Sopenharmony_ci num -= 1; 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci return num + 1; 15462306a36Sopenharmony_ci} 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_cistatic __always_inline __u32 bictcp_clock_us(const struct sock *sk) 15762306a36Sopenharmony_ci{ 15862306a36Sopenharmony_ci return tcp_sk(sk)->tcp_mstamp; 15962306a36Sopenharmony_ci} 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_cistatic __always_inline void bictcp_hystart_reset(struct sock *sk) 16262306a36Sopenharmony_ci{ 16362306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 16462306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 16562306a36Sopenharmony_ci 16662306a36Sopenharmony_ci ca->round_start = ca->last_ack = bictcp_clock_us(sk); 16762306a36Sopenharmony_ci ca->end_seq = tp->snd_nxt; 16862306a36Sopenharmony_ci ca->curr_rtt = ~0U; 16962306a36Sopenharmony_ci ca->sample_cnt = 0; 17062306a36Sopenharmony_ci} 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_ci/* "struct_ops/" prefix is a requirement */ 17362306a36Sopenharmony_ciSEC("struct_ops/bpf_cubic_init") 17462306a36Sopenharmony_civoid BPF_PROG(bpf_cubic_init, struct sock *sk) 17562306a36Sopenharmony_ci{ 17662306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci bictcp_reset(ca); 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci if (hystart) 18162306a36Sopenharmony_ci bictcp_hystart_reset(sk); 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci if (!hystart && initial_ssthresh) 18462306a36Sopenharmony_ci tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 18562306a36Sopenharmony_ci} 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci/* "struct_ops" prefix is a requirement */ 18862306a36Sopenharmony_ciSEC("struct_ops/bpf_cubic_cwnd_event") 18962306a36Sopenharmony_civoid BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) 19062306a36Sopenharmony_ci{ 19162306a36Sopenharmony_ci if (event == CA_EVENT_TX_START) { 19262306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 19362306a36Sopenharmony_ci __u32 now = tcp_jiffies32; 19462306a36Sopenharmony_ci __s32 delta; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci delta = now - tcp_sk(sk)->lsndtime; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci /* We were application limited (idle) for a while. 19962306a36Sopenharmony_ci * Shift epoch_start to keep cwnd growth to cubic curve. 20062306a36Sopenharmony_ci */ 20162306a36Sopenharmony_ci if (ca->epoch_start && delta > 0) { 20262306a36Sopenharmony_ci ca->epoch_start += delta; 20362306a36Sopenharmony_ci if (after(ca->epoch_start, now)) 20462306a36Sopenharmony_ci ca->epoch_start = now; 20562306a36Sopenharmony_ci } 20662306a36Sopenharmony_ci return; 20762306a36Sopenharmony_ci } 20862306a36Sopenharmony_ci} 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci/* 21162306a36Sopenharmony_ci * cbrt(x) MSB values for x MSB values in [0..63]. 21262306a36Sopenharmony_ci * Precomputed then refined by hand - Willy Tarreau 21362306a36Sopenharmony_ci * 21462306a36Sopenharmony_ci * For x in [0..63], 21562306a36Sopenharmony_ci * v = cbrt(x << 18) - 1 21662306a36Sopenharmony_ci * cbrt(x) = (v[x] + 10) >> 6 21762306a36Sopenharmony_ci */ 21862306a36Sopenharmony_cistatic const __u8 v[] = { 21962306a36Sopenharmony_ci /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, 22062306a36Sopenharmony_ci /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, 22162306a36Sopenharmony_ci /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, 22262306a36Sopenharmony_ci /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, 22362306a36Sopenharmony_ci /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, 22462306a36Sopenharmony_ci /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, 22562306a36Sopenharmony_ci /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, 22662306a36Sopenharmony_ci /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, 22762306a36Sopenharmony_ci}; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci/* calculate the cubic root of x using a table lookup followed by one 23062306a36Sopenharmony_ci * Newton-Raphson iteration. 23162306a36Sopenharmony_ci * Avg err ~= 0.195% 23262306a36Sopenharmony_ci */ 23362306a36Sopenharmony_cistatic __always_inline __u32 cubic_root(__u64 a) 23462306a36Sopenharmony_ci{ 23562306a36Sopenharmony_ci __u32 x, b, shift; 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci if (a < 64) { 23862306a36Sopenharmony_ci /* a in [0..63] */ 23962306a36Sopenharmony_ci return ((__u32)v[(__u32)a] + 35) >> 6; 24062306a36Sopenharmony_ci } 24162306a36Sopenharmony_ci 24262306a36Sopenharmony_ci b = fls64(a); 24362306a36Sopenharmony_ci b = ((b * 84) >> 8) - 1; 24462306a36Sopenharmony_ci shift = (a >> (b * 3)); 24562306a36Sopenharmony_ci 24662306a36Sopenharmony_ci /* it is needed for verifier's bound check on v */ 24762306a36Sopenharmony_ci if (shift >= 64) 24862306a36Sopenharmony_ci return 0; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci /* 25362306a36Sopenharmony_ci * Newton-Raphson iteration 25462306a36Sopenharmony_ci * 2 25562306a36Sopenharmony_ci * x = ( 2 * x + a / x ) / 3 25662306a36Sopenharmony_ci * k+1 k k 25762306a36Sopenharmony_ci */ 25862306a36Sopenharmony_ci x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1))); 25962306a36Sopenharmony_ci x = ((x * 341) >> 10); 26062306a36Sopenharmony_ci return x; 26162306a36Sopenharmony_ci} 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci/* 26462306a36Sopenharmony_ci * Compute congestion window to use. 26562306a36Sopenharmony_ci */ 26662306a36Sopenharmony_cistatic __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, 26762306a36Sopenharmony_ci __u32 acked) 26862306a36Sopenharmony_ci{ 26962306a36Sopenharmony_ci __u32 delta, bic_target, max_cnt; 27062306a36Sopenharmony_ci __u64 offs, t; 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci ca->ack_cnt += acked; /* count the number of ACKed packets */ 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci if (ca->last_cwnd == cwnd && 27562306a36Sopenharmony_ci (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) 27662306a36Sopenharmony_ci return; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci /* The CUBIC function can update ca->cnt at most once per jiffy. 27962306a36Sopenharmony_ci * On all cwnd reduction events, ca->epoch_start is set to 0, 28062306a36Sopenharmony_ci * which will force a recalculation of ca->cnt. 28162306a36Sopenharmony_ci */ 28262306a36Sopenharmony_ci if (ca->epoch_start && tcp_jiffies32 == ca->last_time) 28362306a36Sopenharmony_ci goto tcp_friendliness; 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci ca->last_cwnd = cwnd; 28662306a36Sopenharmony_ci ca->last_time = tcp_jiffies32; 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci if (ca->epoch_start == 0) { 28962306a36Sopenharmony_ci ca->epoch_start = tcp_jiffies32; /* record beginning */ 29062306a36Sopenharmony_ci ca->ack_cnt = acked; /* start counting */ 29162306a36Sopenharmony_ci ca->tcp_cwnd = cwnd; /* syn with cubic */ 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci if (ca->last_max_cwnd <= cwnd) { 29462306a36Sopenharmony_ci ca->bic_K = 0; 29562306a36Sopenharmony_ci ca->bic_origin_point = cwnd; 29662306a36Sopenharmony_ci } else { 29762306a36Sopenharmony_ci /* Compute new K based on 29862306a36Sopenharmony_ci * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) 29962306a36Sopenharmony_ci */ 30062306a36Sopenharmony_ci ca->bic_K = cubic_root(cube_factor 30162306a36Sopenharmony_ci * (ca->last_max_cwnd - cwnd)); 30262306a36Sopenharmony_ci ca->bic_origin_point = ca->last_max_cwnd; 30362306a36Sopenharmony_ci } 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci 30662306a36Sopenharmony_ci /* cubic function - calc*/ 30762306a36Sopenharmony_ci /* calculate c * time^3 / rtt, 30862306a36Sopenharmony_ci * while considering overflow in calculation of time^3 30962306a36Sopenharmony_ci * (so time^3 is done by using 64 bit) 31062306a36Sopenharmony_ci * and without the support of division of 64bit numbers 31162306a36Sopenharmony_ci * (so all divisions are done by using 32 bit) 31262306a36Sopenharmony_ci * also NOTE the unit of those veriables 31362306a36Sopenharmony_ci * time = (t - K) / 2^bictcp_HZ 31462306a36Sopenharmony_ci * c = bic_scale >> 10 31562306a36Sopenharmony_ci * rtt = (srtt >> 3) / HZ 31662306a36Sopenharmony_ci * !!! The following code does not have overflow problems, 31762306a36Sopenharmony_ci * if the cwnd < 1 million packets !!! 31862306a36Sopenharmony_ci */ 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY; 32162306a36Sopenharmony_ci t += ca->delay_min; 32262306a36Sopenharmony_ci /* change the unit from usec to bictcp_HZ */ 32362306a36Sopenharmony_ci t <<= BICTCP_HZ; 32462306a36Sopenharmony_ci t /= USEC_PER_SEC; 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci if (t < ca->bic_K) /* t - K */ 32762306a36Sopenharmony_ci offs = ca->bic_K - t; 32862306a36Sopenharmony_ci else 32962306a36Sopenharmony_ci offs = t - ca->bic_K; 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci /* c/rtt * (t-K)^3 */ 33262306a36Sopenharmony_ci delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); 33362306a36Sopenharmony_ci if (t < ca->bic_K) /* below origin*/ 33462306a36Sopenharmony_ci bic_target = ca->bic_origin_point - delta; 33562306a36Sopenharmony_ci else /* above origin*/ 33662306a36Sopenharmony_ci bic_target = ca->bic_origin_point + delta; 33762306a36Sopenharmony_ci 33862306a36Sopenharmony_ci /* cubic function - calc bictcp_cnt*/ 33962306a36Sopenharmony_ci if (bic_target > cwnd) { 34062306a36Sopenharmony_ci ca->cnt = cwnd / (bic_target - cwnd); 34162306a36Sopenharmony_ci } else { 34262306a36Sopenharmony_ci ca->cnt = 100 * cwnd; /* very small increment*/ 34362306a36Sopenharmony_ci } 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ci /* 34662306a36Sopenharmony_ci * The initial growth of cubic function may be too conservative 34762306a36Sopenharmony_ci * when the available bandwidth is still unknown. 34862306a36Sopenharmony_ci */ 34962306a36Sopenharmony_ci if (ca->last_max_cwnd == 0 && ca->cnt > 20) 35062306a36Sopenharmony_ci ca->cnt = 20; /* increase cwnd 5% per RTT */ 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_citcp_friendliness: 35362306a36Sopenharmony_ci /* TCP Friendly */ 35462306a36Sopenharmony_ci if (tcp_friendliness) { 35562306a36Sopenharmony_ci __u32 scale = beta_scale; 35662306a36Sopenharmony_ci __u32 n; 35762306a36Sopenharmony_ci 35862306a36Sopenharmony_ci /* update tcp cwnd */ 35962306a36Sopenharmony_ci delta = (cwnd * scale) >> 3; 36062306a36Sopenharmony_ci if (ca->ack_cnt > delta && delta) { 36162306a36Sopenharmony_ci n = ca->ack_cnt / delta; 36262306a36Sopenharmony_ci ca->ack_cnt -= n * delta; 36362306a36Sopenharmony_ci ca->tcp_cwnd += n; 36462306a36Sopenharmony_ci } 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ 36762306a36Sopenharmony_ci delta = ca->tcp_cwnd - cwnd; 36862306a36Sopenharmony_ci max_cnt = cwnd / delta; 36962306a36Sopenharmony_ci if (ca->cnt > max_cnt) 37062306a36Sopenharmony_ci ca->cnt = max_cnt; 37162306a36Sopenharmony_ci } 37262306a36Sopenharmony_ci } 37362306a36Sopenharmony_ci 37462306a36Sopenharmony_ci /* The maximum rate of cwnd increase CUBIC allows is 1 packet per 37562306a36Sopenharmony_ci * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. 37662306a36Sopenharmony_ci */ 37762306a36Sopenharmony_ci ca->cnt = max(ca->cnt, 2U); 37862306a36Sopenharmony_ci} 37962306a36Sopenharmony_ci 38062306a36Sopenharmony_ci/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ 38162306a36Sopenharmony_civoid BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 38262306a36Sopenharmony_ci{ 38362306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 38462306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci if (!tcp_is_cwnd_limited(sk)) 38762306a36Sopenharmony_ci return; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci if (tcp_in_slow_start(tp)) { 39062306a36Sopenharmony_ci if (hystart && after(ack, ca->end_seq)) 39162306a36Sopenharmony_ci bictcp_hystart_reset(sk); 39262306a36Sopenharmony_ci acked = tcp_slow_start(tp, acked); 39362306a36Sopenharmony_ci if (!acked) 39462306a36Sopenharmony_ci return; 39562306a36Sopenharmony_ci } 39662306a36Sopenharmony_ci bictcp_update(ca, tp->snd_cwnd, acked); 39762306a36Sopenharmony_ci tcp_cong_avoid_ai(tp, ca->cnt, acked); 39862306a36Sopenharmony_ci} 39962306a36Sopenharmony_ci 40062306a36Sopenharmony_ci__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) 40162306a36Sopenharmony_ci{ 40262306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 40362306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 40462306a36Sopenharmony_ci 40562306a36Sopenharmony_ci ca->epoch_start = 0; /* end of epoch */ 40662306a36Sopenharmony_ci 40762306a36Sopenharmony_ci /* Wmax and fast convergence */ 40862306a36Sopenharmony_ci if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) 40962306a36Sopenharmony_ci ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) 41062306a36Sopenharmony_ci / (2 * BICTCP_BETA_SCALE); 41162306a36Sopenharmony_ci else 41262306a36Sopenharmony_ci ca->last_max_cwnd = tp->snd_cwnd; 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 41562306a36Sopenharmony_ci} 41662306a36Sopenharmony_ci 41762306a36Sopenharmony_civoid BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) 41862306a36Sopenharmony_ci{ 41962306a36Sopenharmony_ci if (new_state == TCP_CA_Loss) { 42062306a36Sopenharmony_ci bictcp_reset(inet_csk_ca(sk)); 42162306a36Sopenharmony_ci bictcp_hystart_reset(sk); 42262306a36Sopenharmony_ci } 42362306a36Sopenharmony_ci} 42462306a36Sopenharmony_ci 42562306a36Sopenharmony_ci#define GSO_MAX_SIZE 65536 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci/* Account for TSO/GRO delays. 42862306a36Sopenharmony_ci * Otherwise short RTT flows could get too small ssthresh, since during 42962306a36Sopenharmony_ci * slow start we begin with small TSO packets and ca->delay_min would 43062306a36Sopenharmony_ci * not account for long aggregation delay when TSO packets get bigger. 43162306a36Sopenharmony_ci * Ideally even with a very small RTT we would like to have at least one 43262306a36Sopenharmony_ci * TSO packet being sent and received by GRO, and another one in qdisc layer. 43362306a36Sopenharmony_ci * We apply another 100% factor because @rate is doubled at this point. 43462306a36Sopenharmony_ci * We cap the cushion to 1ms. 43562306a36Sopenharmony_ci */ 43662306a36Sopenharmony_cistatic __always_inline __u32 hystart_ack_delay(struct sock *sk) 43762306a36Sopenharmony_ci{ 43862306a36Sopenharmony_ci unsigned long rate; 43962306a36Sopenharmony_ci 44062306a36Sopenharmony_ci rate = sk->sk_pacing_rate; 44162306a36Sopenharmony_ci if (!rate) 44262306a36Sopenharmony_ci return 0; 44362306a36Sopenharmony_ci return min((__u64)USEC_PER_MSEC, 44462306a36Sopenharmony_ci div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); 44562306a36Sopenharmony_ci} 44662306a36Sopenharmony_ci 44762306a36Sopenharmony_cistatic __always_inline void hystart_update(struct sock *sk, __u32 delay) 44862306a36Sopenharmony_ci{ 44962306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 45062306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 45162306a36Sopenharmony_ci __u32 threshold; 45262306a36Sopenharmony_ci 45362306a36Sopenharmony_ci if (hystart_detect & HYSTART_ACK_TRAIN) { 45462306a36Sopenharmony_ci __u32 now = bictcp_clock_us(sk); 45562306a36Sopenharmony_ci 45662306a36Sopenharmony_ci /* first detection parameter - ack-train detection */ 45762306a36Sopenharmony_ci if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) { 45862306a36Sopenharmony_ci ca->last_ack = now; 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci threshold = ca->delay_min + hystart_ack_delay(sk); 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci /* Hystart ack train triggers if we get ack past 46362306a36Sopenharmony_ci * ca->delay_min/2. 46462306a36Sopenharmony_ci * Pacing might have delayed packets up to RTT/2 46562306a36Sopenharmony_ci * during slow start. 46662306a36Sopenharmony_ci */ 46762306a36Sopenharmony_ci if (sk->sk_pacing_status == SK_PACING_NONE) 46862306a36Sopenharmony_ci threshold >>= 1; 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci if ((__s32)(now - ca->round_start) > threshold) { 47162306a36Sopenharmony_ci ca->found = 1; 47262306a36Sopenharmony_ci tp->snd_ssthresh = tp->snd_cwnd; 47362306a36Sopenharmony_ci } 47462306a36Sopenharmony_ci } 47562306a36Sopenharmony_ci } 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_ci if (hystart_detect & HYSTART_DELAY) { 47862306a36Sopenharmony_ci /* obtain the minimum delay of more than sampling packets */ 47962306a36Sopenharmony_ci if (ca->curr_rtt > delay) 48062306a36Sopenharmony_ci ca->curr_rtt = delay; 48162306a36Sopenharmony_ci if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { 48262306a36Sopenharmony_ci ca->sample_cnt++; 48362306a36Sopenharmony_ci } else { 48462306a36Sopenharmony_ci if (ca->curr_rtt > ca->delay_min + 48562306a36Sopenharmony_ci HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { 48662306a36Sopenharmony_ci ca->found = 1; 48762306a36Sopenharmony_ci tp->snd_ssthresh = tp->snd_cwnd; 48862306a36Sopenharmony_ci } 48962306a36Sopenharmony_ci } 49062306a36Sopenharmony_ci } 49162306a36Sopenharmony_ci} 49262306a36Sopenharmony_ci 49362306a36Sopenharmony_ciint bpf_cubic_acked_called = 0; 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_civoid BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, 49662306a36Sopenharmony_ci const struct ack_sample *sample) 49762306a36Sopenharmony_ci{ 49862306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 49962306a36Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 50062306a36Sopenharmony_ci __u32 delay; 50162306a36Sopenharmony_ci 50262306a36Sopenharmony_ci bpf_cubic_acked_called = 1; 50362306a36Sopenharmony_ci /* Some calls are for duplicates without timetamps */ 50462306a36Sopenharmony_ci if (sample->rtt_us < 0) 50562306a36Sopenharmony_ci return; 50662306a36Sopenharmony_ci 50762306a36Sopenharmony_ci /* Discard delay samples right after fast recovery */ 50862306a36Sopenharmony_ci if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ) 50962306a36Sopenharmony_ci return; 51062306a36Sopenharmony_ci 51162306a36Sopenharmony_ci delay = sample->rtt_us; 51262306a36Sopenharmony_ci if (delay == 0) 51362306a36Sopenharmony_ci delay = 1; 51462306a36Sopenharmony_ci 51562306a36Sopenharmony_ci /* first time call or link delay decreases */ 51662306a36Sopenharmony_ci if (ca->delay_min == 0 || ca->delay_min > delay) 51762306a36Sopenharmony_ci ca->delay_min = delay; 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci /* hystart triggers when cwnd is larger than some threshold */ 52062306a36Sopenharmony_ci if (!ca->found && tcp_in_slow_start(tp) && hystart && 52162306a36Sopenharmony_ci tp->snd_cwnd >= hystart_low_window) 52262306a36Sopenharmony_ci hystart_update(sk, delay); 52362306a36Sopenharmony_ci} 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ciextern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_ci__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) 52862306a36Sopenharmony_ci{ 52962306a36Sopenharmony_ci return tcp_reno_undo_cwnd(sk); 53062306a36Sopenharmony_ci} 53162306a36Sopenharmony_ci 53262306a36Sopenharmony_ciSEC(".struct_ops") 53362306a36Sopenharmony_cistruct tcp_congestion_ops cubic = { 53462306a36Sopenharmony_ci .init = (void *)bpf_cubic_init, 53562306a36Sopenharmony_ci .ssthresh = (void *)bpf_cubic_recalc_ssthresh, 53662306a36Sopenharmony_ci .cong_avoid = (void *)bpf_cubic_cong_avoid, 53762306a36Sopenharmony_ci .set_state = (void *)bpf_cubic_state, 53862306a36Sopenharmony_ci .undo_cwnd = (void *)bpf_cubic_undo_cwnd, 53962306a36Sopenharmony_ci .cwnd_event = (void *)bpf_cubic_cwnd_event, 54062306a36Sopenharmony_ci .pkts_acked = (void *)bpf_cubic_acked, 54162306a36Sopenharmony_ci .name = "bpf_cubic", 54262306a36Sopenharmony_ci}; 543