18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci 38c2ecf20Sopenharmony_ci/* WARNING: This implemenation is not necessarily the same 48c2ecf20Sopenharmony_ci * as the tcp_cubic.c. The purpose is mainly for testing 58c2ecf20Sopenharmony_ci * the kernel BPF logic. 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Highlights: 88c2ecf20Sopenharmony_ci * 1. CONFIG_HZ .kconfig map is used. 98c2ecf20Sopenharmony_ci * 2. In bictcp_update(), calculation is changed to use usec 108c2ecf20Sopenharmony_ci * resolution (i.e. USEC_PER_JIFFY) instead of using jiffies. 118c2ecf20Sopenharmony_ci * Thus, usecs_to_jiffies() is not used in the bpf_cubic.c. 128c2ecf20Sopenharmony_ci * 3. In bitctcp_update() [under tcp_friendliness], the original 138c2ecf20Sopenharmony_ci * "while (ca->ack_cnt > delta)" loop is changed to the equivalent 148c2ecf20Sopenharmony_ci * "ca->ack_cnt / delta" operation. 158c2ecf20Sopenharmony_ci */ 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci#include <linux/bpf.h> 188c2ecf20Sopenharmony_ci#include <linux/stddef.h> 198c2ecf20Sopenharmony_ci#include <linux/tcp.h> 208c2ecf20Sopenharmony_ci#include "bpf_tcp_helpers.h" 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_cichar _license[] SEC("license") = "GPL"; 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_ci#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_ci#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation 278c2ecf20Sopenharmony_ci * max_cwnd = snd_cwnd * beta 288c2ecf20Sopenharmony_ci */ 298c2ecf20Sopenharmony_ci#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci/* Two methods of hybrid slow start */ 328c2ecf20Sopenharmony_ci#define HYSTART_ACK_TRAIN 0x1 338c2ecf20Sopenharmony_ci#define HYSTART_DELAY 0x2 348c2ecf20Sopenharmony_ci 358c2ecf20Sopenharmony_ci/* Number of delay samples for detecting the increase of delay */ 368c2ecf20Sopenharmony_ci#define HYSTART_MIN_SAMPLES 8 378c2ecf20Sopenharmony_ci#define HYSTART_DELAY_MIN (4000U) /* 4ms */ 388c2ecf20Sopenharmony_ci#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ 398c2ecf20Sopenharmony_ci#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_cistatic int fast_convergence = 1; 428c2ecf20Sopenharmony_cistatic const int beta = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ 438c2ecf20Sopenharmony_cistatic int initial_ssthresh; 448c2ecf20Sopenharmony_cistatic const int bic_scale = 41; 458c2ecf20Sopenharmony_cistatic int tcp_friendliness = 1; 468c2ecf20Sopenharmony_ci 478c2ecf20Sopenharmony_cistatic int hystart = 1; 488c2ecf20Sopenharmony_cistatic int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY; 498c2ecf20Sopenharmony_cistatic int hystart_low_window = 16; 508c2ecf20Sopenharmony_cistatic int hystart_ack_delta_us = 2000; 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_cistatic const __u32 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ 538c2ecf20Sopenharmony_cistatic const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 548c2ecf20Sopenharmony_ci / (BICTCP_BETA_SCALE - beta); 558c2ecf20Sopenharmony_ci/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 568c2ecf20Sopenharmony_ci * so K = cubic_root( (wmax-cwnd)*rtt/c ) 578c2ecf20Sopenharmony_ci * the unit of K is bictcp_HZ=2^10, not HZ 588c2ecf20Sopenharmony_ci * 598c2ecf20Sopenharmony_ci * c = bic_scale >> 10 608c2ecf20Sopenharmony_ci * rtt = 100ms 618c2ecf20Sopenharmony_ci * 628c2ecf20Sopenharmony_ci * the following code has been designed and tested for 638c2ecf20Sopenharmony_ci * cwnd < 1 million packets 648c2ecf20Sopenharmony_ci * RTT < 100 seconds 658c2ecf20Sopenharmony_ci * HZ < 1,000,00 (corresponding to 10 nano-second) 668c2ecf20Sopenharmony_ci */ 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci/* 1/c * 2^2*bictcp_HZ * srtt, 2^40 */ 698c2ecf20Sopenharmony_cistatic const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) 708c2ecf20Sopenharmony_ci / (bic_scale * 10); 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_ci/* BIC TCP Parameters */ 738c2ecf20Sopenharmony_cistruct bictcp { 748c2ecf20Sopenharmony_ci __u32 cnt; /* increase cwnd by 1 after ACKs */ 758c2ecf20Sopenharmony_ci __u32 last_max_cwnd; /* last maximum snd_cwnd */ 768c2ecf20Sopenharmony_ci __u32 last_cwnd; /* the last snd_cwnd */ 778c2ecf20Sopenharmony_ci __u32 last_time; /* time when updated last_cwnd */ 788c2ecf20Sopenharmony_ci __u32 bic_origin_point;/* origin point of bic function */ 798c2ecf20Sopenharmony_ci __u32 bic_K; /* time to origin point 808c2ecf20Sopenharmony_ci from the beginning of the current epoch */ 818c2ecf20Sopenharmony_ci __u32 delay_min; /* min delay (usec) */ 828c2ecf20Sopenharmony_ci __u32 epoch_start; /* beginning of an epoch */ 838c2ecf20Sopenharmony_ci __u32 ack_cnt; /* number of acks */ 848c2ecf20Sopenharmony_ci __u32 tcp_cwnd; /* estimated tcp cwnd */ 858c2ecf20Sopenharmony_ci __u16 unused; 868c2ecf20Sopenharmony_ci __u8 sample_cnt; /* number of samples to decide curr_rtt */ 878c2ecf20Sopenharmony_ci __u8 found; /* the exit point is found? */ 888c2ecf20Sopenharmony_ci __u32 round_start; /* beginning of each round */ 898c2ecf20Sopenharmony_ci __u32 end_seq; /* end_seq of the round */ 908c2ecf20Sopenharmony_ci __u32 last_ack; /* last time when the ACK spacing is close */ 918c2ecf20Sopenharmony_ci __u32 curr_rtt; /* the minimum rtt of current round */ 928c2ecf20Sopenharmony_ci}; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_cistatic inline void bictcp_reset(struct bictcp *ca) 958c2ecf20Sopenharmony_ci{ 968c2ecf20Sopenharmony_ci ca->cnt = 0; 978c2ecf20Sopenharmony_ci ca->last_max_cwnd = 0; 988c2ecf20Sopenharmony_ci ca->last_cwnd = 0; 998c2ecf20Sopenharmony_ci ca->last_time = 0; 1008c2ecf20Sopenharmony_ci ca->bic_origin_point = 0; 1018c2ecf20Sopenharmony_ci ca->bic_K = 0; 1028c2ecf20Sopenharmony_ci ca->delay_min = 0; 1038c2ecf20Sopenharmony_ci ca->epoch_start = 0; 1048c2ecf20Sopenharmony_ci ca->ack_cnt = 0; 1058c2ecf20Sopenharmony_ci ca->tcp_cwnd = 0; 1068c2ecf20Sopenharmony_ci ca->found = 0; 1078c2ecf20Sopenharmony_ci} 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ciextern unsigned long CONFIG_HZ __kconfig; 1108c2ecf20Sopenharmony_ci#define HZ CONFIG_HZ 1118c2ecf20Sopenharmony_ci#define USEC_PER_MSEC 1000UL 1128c2ecf20Sopenharmony_ci#define USEC_PER_SEC 1000000UL 1138c2ecf20Sopenharmony_ci#define USEC_PER_JIFFY (USEC_PER_SEC / HZ) 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_cistatic __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) 1168c2ecf20Sopenharmony_ci{ 1178c2ecf20Sopenharmony_ci return dividend / divisor; 1188c2ecf20Sopenharmony_ci} 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci#define div64_ul div64_u64 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci#define BITS_PER_U64 (sizeof(__u64) * 8) 1238c2ecf20Sopenharmony_cistatic __always_inline int fls64(__u64 x) 1248c2ecf20Sopenharmony_ci{ 1258c2ecf20Sopenharmony_ci int num = BITS_PER_U64 - 1; 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci if (x == 0) 1288c2ecf20Sopenharmony_ci return 0; 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-32)))) { 1318c2ecf20Sopenharmony_ci num -= 32; 1328c2ecf20Sopenharmony_ci x <<= 32; 1338c2ecf20Sopenharmony_ci } 1348c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-16)))) { 1358c2ecf20Sopenharmony_ci num -= 16; 1368c2ecf20Sopenharmony_ci x <<= 16; 1378c2ecf20Sopenharmony_ci } 1388c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-8)))) { 1398c2ecf20Sopenharmony_ci num -= 8; 1408c2ecf20Sopenharmony_ci x <<= 8; 1418c2ecf20Sopenharmony_ci } 1428c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-4)))) { 1438c2ecf20Sopenharmony_ci num -= 4; 1448c2ecf20Sopenharmony_ci x <<= 4; 1458c2ecf20Sopenharmony_ci } 1468c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-2)))) { 1478c2ecf20Sopenharmony_ci num -= 2; 1488c2ecf20Sopenharmony_ci x <<= 2; 1498c2ecf20Sopenharmony_ci } 1508c2ecf20Sopenharmony_ci if (!(x & (~0ull << (BITS_PER_U64-1)))) 1518c2ecf20Sopenharmony_ci num -= 1; 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci return num + 1; 1548c2ecf20Sopenharmony_ci} 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_cistatic __always_inline __u32 bictcp_clock_us(const struct sock *sk) 1578c2ecf20Sopenharmony_ci{ 1588c2ecf20Sopenharmony_ci return tcp_sk(sk)->tcp_mstamp; 1598c2ecf20Sopenharmony_ci} 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_cistatic __always_inline void bictcp_hystart_reset(struct sock *sk) 1628c2ecf20Sopenharmony_ci{ 1638c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 1648c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 1658c2ecf20Sopenharmony_ci 1668c2ecf20Sopenharmony_ci ca->round_start = ca->last_ack = bictcp_clock_us(sk); 1678c2ecf20Sopenharmony_ci ca->end_seq = tp->snd_nxt; 1688c2ecf20Sopenharmony_ci ca->curr_rtt = ~0U; 1698c2ecf20Sopenharmony_ci ca->sample_cnt = 0; 1708c2ecf20Sopenharmony_ci} 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci/* "struct_ops/" prefix is not a requirement 1738c2ecf20Sopenharmony_ci * It will be recognized as BPF_PROG_TYPE_STRUCT_OPS 1748c2ecf20Sopenharmony_ci * as long as it is used in one of the func ptr 1758c2ecf20Sopenharmony_ci * under SEC(".struct_ops"). 1768c2ecf20Sopenharmony_ci */ 1778c2ecf20Sopenharmony_ciSEC("struct_ops/bictcp_init") 1788c2ecf20Sopenharmony_civoid BPF_PROG(bictcp_init, struct sock *sk) 1798c2ecf20Sopenharmony_ci{ 1808c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci bictcp_reset(ca); 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_ci if (hystart) 1858c2ecf20Sopenharmony_ci bictcp_hystart_reset(sk); 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci if (!hystart && initial_ssthresh) 1888c2ecf20Sopenharmony_ci tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 1898c2ecf20Sopenharmony_ci} 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci/* No prefix in SEC will also work. 1928c2ecf20Sopenharmony_ci * The remaining tcp-cubic functions have an easier way. 1938c2ecf20Sopenharmony_ci */ 1948c2ecf20Sopenharmony_ciSEC("no-sec-prefix-bictcp_cwnd_event") 1958c2ecf20Sopenharmony_civoid BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) 1968c2ecf20Sopenharmony_ci{ 1978c2ecf20Sopenharmony_ci if (event == CA_EVENT_TX_START) { 1988c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 1998c2ecf20Sopenharmony_ci __u32 now = tcp_jiffies32; 2008c2ecf20Sopenharmony_ci __s32 delta; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci delta = now - tcp_sk(sk)->lsndtime; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci /* We were application limited (idle) for a while. 2058c2ecf20Sopenharmony_ci * Shift epoch_start to keep cwnd growth to cubic curve. 2068c2ecf20Sopenharmony_ci */ 2078c2ecf20Sopenharmony_ci if (ca->epoch_start && delta > 0) { 2088c2ecf20Sopenharmony_ci ca->epoch_start += delta; 2098c2ecf20Sopenharmony_ci if (after(ca->epoch_start, now)) 2108c2ecf20Sopenharmony_ci ca->epoch_start = now; 2118c2ecf20Sopenharmony_ci } 2128c2ecf20Sopenharmony_ci return; 2138c2ecf20Sopenharmony_ci } 2148c2ecf20Sopenharmony_ci} 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci/* 2178c2ecf20Sopenharmony_ci * cbrt(x) MSB values for x MSB values in [0..63]. 2188c2ecf20Sopenharmony_ci * Precomputed then refined by hand - Willy Tarreau 2198c2ecf20Sopenharmony_ci * 2208c2ecf20Sopenharmony_ci * For x in [0..63], 2218c2ecf20Sopenharmony_ci * v = cbrt(x << 18) - 1 2228c2ecf20Sopenharmony_ci * cbrt(x) = (v[x] + 10) >> 6 2238c2ecf20Sopenharmony_ci */ 2248c2ecf20Sopenharmony_cistatic const __u8 v[] = { 2258c2ecf20Sopenharmony_ci /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, 2268c2ecf20Sopenharmony_ci /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, 2278c2ecf20Sopenharmony_ci /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, 2288c2ecf20Sopenharmony_ci /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, 2298c2ecf20Sopenharmony_ci /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, 2308c2ecf20Sopenharmony_ci /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, 2318c2ecf20Sopenharmony_ci /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, 2328c2ecf20Sopenharmony_ci /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, 2338c2ecf20Sopenharmony_ci}; 2348c2ecf20Sopenharmony_ci 2358c2ecf20Sopenharmony_ci/* calculate the cubic root of x using a table lookup followed by one 2368c2ecf20Sopenharmony_ci * Newton-Raphson iteration. 2378c2ecf20Sopenharmony_ci * Avg err ~= 0.195% 2388c2ecf20Sopenharmony_ci */ 2398c2ecf20Sopenharmony_cistatic __always_inline __u32 cubic_root(__u64 a) 2408c2ecf20Sopenharmony_ci{ 2418c2ecf20Sopenharmony_ci __u32 x, b, shift; 2428c2ecf20Sopenharmony_ci 2438c2ecf20Sopenharmony_ci if (a < 64) { 2448c2ecf20Sopenharmony_ci /* a in [0..63] */ 2458c2ecf20Sopenharmony_ci return ((__u32)v[(__u32)a] + 35) >> 6; 2468c2ecf20Sopenharmony_ci } 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci b = fls64(a); 2498c2ecf20Sopenharmony_ci b = ((b * 84) >> 8) - 1; 2508c2ecf20Sopenharmony_ci shift = (a >> (b * 3)); 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_ci /* it is needed for verifier's bound check on v */ 2538c2ecf20Sopenharmony_ci if (shift >= 64) 2548c2ecf20Sopenharmony_ci return 0; 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6; 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_ci /* 2598c2ecf20Sopenharmony_ci * Newton-Raphson iteration 2608c2ecf20Sopenharmony_ci * 2 2618c2ecf20Sopenharmony_ci * x = ( 2 * x + a / x ) / 3 2628c2ecf20Sopenharmony_ci * k+1 k k 2638c2ecf20Sopenharmony_ci */ 2648c2ecf20Sopenharmony_ci x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1))); 2658c2ecf20Sopenharmony_ci x = ((x * 341) >> 10); 2668c2ecf20Sopenharmony_ci return x; 2678c2ecf20Sopenharmony_ci} 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci/* 2708c2ecf20Sopenharmony_ci * Compute congestion window to use. 2718c2ecf20Sopenharmony_ci */ 2728c2ecf20Sopenharmony_cistatic __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, 2738c2ecf20Sopenharmony_ci __u32 acked) 2748c2ecf20Sopenharmony_ci{ 2758c2ecf20Sopenharmony_ci __u32 delta, bic_target, max_cnt; 2768c2ecf20Sopenharmony_ci __u64 offs, t; 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci ca->ack_cnt += acked; /* count the number of ACKed packets */ 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci if (ca->last_cwnd == cwnd && 2818c2ecf20Sopenharmony_ci (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) 2828c2ecf20Sopenharmony_ci return; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci /* The CUBIC function can update ca->cnt at most once per jiffy. 2858c2ecf20Sopenharmony_ci * On all cwnd reduction events, ca->epoch_start is set to 0, 2868c2ecf20Sopenharmony_ci * which will force a recalculation of ca->cnt. 2878c2ecf20Sopenharmony_ci */ 2888c2ecf20Sopenharmony_ci if (ca->epoch_start && tcp_jiffies32 == ca->last_time) 2898c2ecf20Sopenharmony_ci goto tcp_friendliness; 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci ca->last_cwnd = cwnd; 2928c2ecf20Sopenharmony_ci ca->last_time = tcp_jiffies32; 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_ci if (ca->epoch_start == 0) { 2958c2ecf20Sopenharmony_ci ca->epoch_start = tcp_jiffies32; /* record beginning */ 2968c2ecf20Sopenharmony_ci ca->ack_cnt = acked; /* start counting */ 2978c2ecf20Sopenharmony_ci ca->tcp_cwnd = cwnd; /* syn with cubic */ 2988c2ecf20Sopenharmony_ci 2998c2ecf20Sopenharmony_ci if (ca->last_max_cwnd <= cwnd) { 3008c2ecf20Sopenharmony_ci ca->bic_K = 0; 3018c2ecf20Sopenharmony_ci ca->bic_origin_point = cwnd; 3028c2ecf20Sopenharmony_ci } else { 3038c2ecf20Sopenharmony_ci /* Compute new K based on 3048c2ecf20Sopenharmony_ci * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) 3058c2ecf20Sopenharmony_ci */ 3068c2ecf20Sopenharmony_ci ca->bic_K = cubic_root(cube_factor 3078c2ecf20Sopenharmony_ci * (ca->last_max_cwnd - cwnd)); 3088c2ecf20Sopenharmony_ci ca->bic_origin_point = ca->last_max_cwnd; 3098c2ecf20Sopenharmony_ci } 3108c2ecf20Sopenharmony_ci } 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci /* cubic function - calc*/ 3138c2ecf20Sopenharmony_ci /* calculate c * time^3 / rtt, 3148c2ecf20Sopenharmony_ci * while considering overflow in calculation of time^3 3158c2ecf20Sopenharmony_ci * (so time^3 is done by using 64 bit) 3168c2ecf20Sopenharmony_ci * and without the support of division of 64bit numbers 3178c2ecf20Sopenharmony_ci * (so all divisions are done by using 32 bit) 3188c2ecf20Sopenharmony_ci * also NOTE the unit of those veriables 3198c2ecf20Sopenharmony_ci * time = (t - K) / 2^bictcp_HZ 3208c2ecf20Sopenharmony_ci * c = bic_scale >> 10 3218c2ecf20Sopenharmony_ci * rtt = (srtt >> 3) / HZ 3228c2ecf20Sopenharmony_ci * !!! The following code does not have overflow problems, 3238c2ecf20Sopenharmony_ci * if the cwnd < 1 million packets !!! 3248c2ecf20Sopenharmony_ci */ 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY; 3278c2ecf20Sopenharmony_ci t += ca->delay_min; 3288c2ecf20Sopenharmony_ci /* change the unit from usec to bictcp_HZ */ 3298c2ecf20Sopenharmony_ci t <<= BICTCP_HZ; 3308c2ecf20Sopenharmony_ci t /= USEC_PER_SEC; 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci if (t < ca->bic_K) /* t - K */ 3338c2ecf20Sopenharmony_ci offs = ca->bic_K - t; 3348c2ecf20Sopenharmony_ci else 3358c2ecf20Sopenharmony_ci offs = t - ca->bic_K; 3368c2ecf20Sopenharmony_ci 3378c2ecf20Sopenharmony_ci /* c/rtt * (t-K)^3 */ 3388c2ecf20Sopenharmony_ci delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); 3398c2ecf20Sopenharmony_ci if (t < ca->bic_K) /* below origin*/ 3408c2ecf20Sopenharmony_ci bic_target = ca->bic_origin_point - delta; 3418c2ecf20Sopenharmony_ci else /* above origin*/ 3428c2ecf20Sopenharmony_ci bic_target = ca->bic_origin_point + delta; 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci /* cubic function - calc bictcp_cnt*/ 3458c2ecf20Sopenharmony_ci if (bic_target > cwnd) { 3468c2ecf20Sopenharmony_ci ca->cnt = cwnd / (bic_target - cwnd); 3478c2ecf20Sopenharmony_ci } else { 3488c2ecf20Sopenharmony_ci ca->cnt = 100 * cwnd; /* very small increment*/ 3498c2ecf20Sopenharmony_ci } 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci /* 3528c2ecf20Sopenharmony_ci * The initial growth of cubic function may be too conservative 3538c2ecf20Sopenharmony_ci * when the available bandwidth is still unknown. 3548c2ecf20Sopenharmony_ci */ 3558c2ecf20Sopenharmony_ci if (ca->last_max_cwnd == 0 && ca->cnt > 20) 3568c2ecf20Sopenharmony_ci ca->cnt = 20; /* increase cwnd 5% per RTT */ 3578c2ecf20Sopenharmony_ci 3588c2ecf20Sopenharmony_citcp_friendliness: 3598c2ecf20Sopenharmony_ci /* TCP Friendly */ 3608c2ecf20Sopenharmony_ci if (tcp_friendliness) { 3618c2ecf20Sopenharmony_ci __u32 scale = beta_scale; 3628c2ecf20Sopenharmony_ci __u32 n; 3638c2ecf20Sopenharmony_ci 3648c2ecf20Sopenharmony_ci /* update tcp cwnd */ 3658c2ecf20Sopenharmony_ci delta = (cwnd * scale) >> 3; 3668c2ecf20Sopenharmony_ci if (ca->ack_cnt > delta && delta) { 3678c2ecf20Sopenharmony_ci n = ca->ack_cnt / delta; 3688c2ecf20Sopenharmony_ci ca->ack_cnt -= n * delta; 3698c2ecf20Sopenharmony_ci ca->tcp_cwnd += n; 3708c2ecf20Sopenharmony_ci } 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_ci if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ 3738c2ecf20Sopenharmony_ci delta = ca->tcp_cwnd - cwnd; 3748c2ecf20Sopenharmony_ci max_cnt = cwnd / delta; 3758c2ecf20Sopenharmony_ci if (ca->cnt > max_cnt) 3768c2ecf20Sopenharmony_ci ca->cnt = max_cnt; 3778c2ecf20Sopenharmony_ci } 3788c2ecf20Sopenharmony_ci } 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_ci /* The maximum rate of cwnd increase CUBIC allows is 1 packet per 3818c2ecf20Sopenharmony_ci * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. 3828c2ecf20Sopenharmony_ci */ 3838c2ecf20Sopenharmony_ci ca->cnt = max(ca->cnt, 2U); 3848c2ecf20Sopenharmony_ci} 3858c2ecf20Sopenharmony_ci 3868c2ecf20Sopenharmony_ci/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ 3878c2ecf20Sopenharmony_civoid BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 3888c2ecf20Sopenharmony_ci{ 3898c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 3908c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci if (!tcp_is_cwnd_limited(sk)) 3938c2ecf20Sopenharmony_ci return; 3948c2ecf20Sopenharmony_ci 3958c2ecf20Sopenharmony_ci if (tcp_in_slow_start(tp)) { 3968c2ecf20Sopenharmony_ci if (hystart && after(ack, ca->end_seq)) 3978c2ecf20Sopenharmony_ci bictcp_hystart_reset(sk); 3988c2ecf20Sopenharmony_ci acked = tcp_slow_start(tp, acked); 3998c2ecf20Sopenharmony_ci if (!acked) 4008c2ecf20Sopenharmony_ci return; 4018c2ecf20Sopenharmony_ci } 4028c2ecf20Sopenharmony_ci bictcp_update(ca, tp->snd_cwnd, acked); 4038c2ecf20Sopenharmony_ci tcp_cong_avoid_ai(tp, ca->cnt, acked); 4048c2ecf20Sopenharmony_ci} 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) 4078c2ecf20Sopenharmony_ci{ 4088c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 4098c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 4108c2ecf20Sopenharmony_ci 4118c2ecf20Sopenharmony_ci ca->epoch_start = 0; /* end of epoch */ 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_ci /* Wmax and fast convergence */ 4148c2ecf20Sopenharmony_ci if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) 4158c2ecf20Sopenharmony_ci ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) 4168c2ecf20Sopenharmony_ci / (2 * BICTCP_BETA_SCALE); 4178c2ecf20Sopenharmony_ci else 4188c2ecf20Sopenharmony_ci ca->last_max_cwnd = tp->snd_cwnd; 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 4218c2ecf20Sopenharmony_ci} 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_civoid BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) 4248c2ecf20Sopenharmony_ci{ 4258c2ecf20Sopenharmony_ci if (new_state == TCP_CA_Loss) { 4268c2ecf20Sopenharmony_ci bictcp_reset(inet_csk_ca(sk)); 4278c2ecf20Sopenharmony_ci bictcp_hystart_reset(sk); 4288c2ecf20Sopenharmony_ci } 4298c2ecf20Sopenharmony_ci} 4308c2ecf20Sopenharmony_ci 4318c2ecf20Sopenharmony_ci#define GSO_MAX_SIZE 65536 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci/* Account for TSO/GRO delays. 4348c2ecf20Sopenharmony_ci * Otherwise short RTT flows could get too small ssthresh, since during 4358c2ecf20Sopenharmony_ci * slow start we begin with small TSO packets and ca->delay_min would 4368c2ecf20Sopenharmony_ci * not account for long aggregation delay when TSO packets get bigger. 4378c2ecf20Sopenharmony_ci * Ideally even with a very small RTT we would like to have at least one 4388c2ecf20Sopenharmony_ci * TSO packet being sent and received by GRO, and another one in qdisc layer. 4398c2ecf20Sopenharmony_ci * We apply another 100% factor because @rate is doubled at this point. 4408c2ecf20Sopenharmony_ci * We cap the cushion to 1ms. 4418c2ecf20Sopenharmony_ci */ 4428c2ecf20Sopenharmony_cistatic __always_inline __u32 hystart_ack_delay(struct sock *sk) 4438c2ecf20Sopenharmony_ci{ 4448c2ecf20Sopenharmony_ci unsigned long rate; 4458c2ecf20Sopenharmony_ci 4468c2ecf20Sopenharmony_ci rate = sk->sk_pacing_rate; 4478c2ecf20Sopenharmony_ci if (!rate) 4488c2ecf20Sopenharmony_ci return 0; 4498c2ecf20Sopenharmony_ci return min((__u64)USEC_PER_MSEC, 4508c2ecf20Sopenharmony_ci div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); 4518c2ecf20Sopenharmony_ci} 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_cistatic __always_inline void hystart_update(struct sock *sk, __u32 delay) 4548c2ecf20Sopenharmony_ci{ 4558c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 4568c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 4578c2ecf20Sopenharmony_ci __u32 threshold; 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci if (hystart_detect & HYSTART_ACK_TRAIN) { 4608c2ecf20Sopenharmony_ci __u32 now = bictcp_clock_us(sk); 4618c2ecf20Sopenharmony_ci 4628c2ecf20Sopenharmony_ci /* first detection parameter - ack-train detection */ 4638c2ecf20Sopenharmony_ci if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) { 4648c2ecf20Sopenharmony_ci ca->last_ack = now; 4658c2ecf20Sopenharmony_ci 4668c2ecf20Sopenharmony_ci threshold = ca->delay_min + hystart_ack_delay(sk); 4678c2ecf20Sopenharmony_ci 4688c2ecf20Sopenharmony_ci /* Hystart ack train triggers if we get ack past 4698c2ecf20Sopenharmony_ci * ca->delay_min/2. 4708c2ecf20Sopenharmony_ci * Pacing might have delayed packets up to RTT/2 4718c2ecf20Sopenharmony_ci * during slow start. 4728c2ecf20Sopenharmony_ci */ 4738c2ecf20Sopenharmony_ci if (sk->sk_pacing_status == SK_PACING_NONE) 4748c2ecf20Sopenharmony_ci threshold >>= 1; 4758c2ecf20Sopenharmony_ci 4768c2ecf20Sopenharmony_ci if ((__s32)(now - ca->round_start) > threshold) { 4778c2ecf20Sopenharmony_ci ca->found = 1; 4788c2ecf20Sopenharmony_ci tp->snd_ssthresh = tp->snd_cwnd; 4798c2ecf20Sopenharmony_ci } 4808c2ecf20Sopenharmony_ci } 4818c2ecf20Sopenharmony_ci } 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci if (hystart_detect & HYSTART_DELAY) { 4848c2ecf20Sopenharmony_ci /* obtain the minimum delay of more than sampling packets */ 4858c2ecf20Sopenharmony_ci if (ca->curr_rtt > delay) 4868c2ecf20Sopenharmony_ci ca->curr_rtt = delay; 4878c2ecf20Sopenharmony_ci if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { 4888c2ecf20Sopenharmony_ci ca->sample_cnt++; 4898c2ecf20Sopenharmony_ci } else { 4908c2ecf20Sopenharmony_ci if (ca->curr_rtt > ca->delay_min + 4918c2ecf20Sopenharmony_ci HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { 4928c2ecf20Sopenharmony_ci ca->found = 1; 4938c2ecf20Sopenharmony_ci tp->snd_ssthresh = tp->snd_cwnd; 4948c2ecf20Sopenharmony_ci } 4958c2ecf20Sopenharmony_ci } 4968c2ecf20Sopenharmony_ci } 4978c2ecf20Sopenharmony_ci} 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_civoid BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, 5008c2ecf20Sopenharmony_ci const struct ack_sample *sample) 5018c2ecf20Sopenharmony_ci{ 5028c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 5038c2ecf20Sopenharmony_ci struct bictcp *ca = inet_csk_ca(sk); 5048c2ecf20Sopenharmony_ci __u32 delay; 5058c2ecf20Sopenharmony_ci 5068c2ecf20Sopenharmony_ci /* Some calls are for duplicates without timetamps */ 5078c2ecf20Sopenharmony_ci if (sample->rtt_us < 0) 5088c2ecf20Sopenharmony_ci return; 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci /* Discard delay samples right after fast recovery */ 5118c2ecf20Sopenharmony_ci if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ) 5128c2ecf20Sopenharmony_ci return; 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci delay = sample->rtt_us; 5158c2ecf20Sopenharmony_ci if (delay == 0) 5168c2ecf20Sopenharmony_ci delay = 1; 5178c2ecf20Sopenharmony_ci 5188c2ecf20Sopenharmony_ci /* first time call or link delay decreases */ 5198c2ecf20Sopenharmony_ci if (ca->delay_min == 0 || ca->delay_min > delay) 5208c2ecf20Sopenharmony_ci ca->delay_min = delay; 5218c2ecf20Sopenharmony_ci 5228c2ecf20Sopenharmony_ci /* hystart triggers when cwnd is larger than some threshold */ 5238c2ecf20Sopenharmony_ci if (!ca->found && tcp_in_slow_start(tp) && hystart && 5248c2ecf20Sopenharmony_ci tp->snd_cwnd >= hystart_low_window) 5258c2ecf20Sopenharmony_ci hystart_update(sk, delay); 5268c2ecf20Sopenharmony_ci} 5278c2ecf20Sopenharmony_ci 5288c2ecf20Sopenharmony_ci__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) 5298c2ecf20Sopenharmony_ci{ 5308c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_ci return max(tp->snd_cwnd, tp->prior_cwnd); 5338c2ecf20Sopenharmony_ci} 5348c2ecf20Sopenharmony_ci 5358c2ecf20Sopenharmony_ciSEC(".struct_ops") 5368c2ecf20Sopenharmony_cistruct tcp_congestion_ops cubic = { 5378c2ecf20Sopenharmony_ci .init = (void *)bictcp_init, 5388c2ecf20Sopenharmony_ci .ssthresh = (void *)bictcp_recalc_ssthresh, 5398c2ecf20Sopenharmony_ci .cong_avoid = (void *)bictcp_cong_avoid, 5408c2ecf20Sopenharmony_ci .set_state = (void *)bictcp_state, 5418c2ecf20Sopenharmony_ci .undo_cwnd = (void *)tcp_reno_undo_cwnd, 5428c2ecf20Sopenharmony_ci .cwnd_event = (void *)bictcp_cwnd_event, 5438c2ecf20Sopenharmony_ci .pkts_acked = (void *)bictcp_acked, 5448c2ecf20Sopenharmony_ci .name = "bpf_cubic", 5458c2ecf20Sopenharmony_ci}; 546