/* Bottleneck Bandwidth and RTT (BBR) congestion control
 *
 * BBR congestion control computes the sending rate based on the delivery
 * rate (throughput) estimated from ACKs. In a nutshell:
 *
 * On each ACK, update our model of the network path:
 *   bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
 *   min_rtt = windowed_min(rtt, 10 seconds)
 * pacing_rate = pacing_gain * bottleneck_bandwidth
 * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
 *
 * The core algorithm does not react directly to packet losses or delays,
 * although BBR may adjust the size of next send per ACK when loss is
 * observed, or adjust the sending rate if it estimates there is a
 * traffic policer, in order to keep the drop rate reasonable.
 *
 * Here is a state transition diagram for BBR:
 *
 *             |
 *             V
 *    +---> STARTUP  ----+
 *    |        |         |
 *    |        V         |
 *    |      DRAIN   ----+
 *    |        |         |
 *    |        V         |
 *    +---> PROBE_BW ----+
 *    |      ^    |      |
 *    |      |    |      |
 *    |      +----+      |
 *    |                  |
 *    +---- PROBE_RTT <--+
 *
 * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
 * When it estimates the pipe is full, it enters DRAIN to drain the queue.
 * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
 * A long-lived BBR flow spends the vast majority of its time remaining
 * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
 * in a fair manner, with a small, bounded queue. *If* a flow has been
 * continuously sending for the entire min_rtt window, and hasn't seen an RTT
 * sample that matches or decreases its min_rtt estimate for 10 seconds, then
 * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
 * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
 * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
 * otherwise we enter STARTUP to try to fill the pipe.
 *
 * BBR is described in detail in:
 *   "BBR: Congestion-Based Congestion Control",
 *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
 *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
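 *
 * As a quick illustration of the model above (hypothetical numbers, not from
 * the paper): with bottleneck_bandwidth = 1250 pkts/sec, min_rtt = 40 ms,
 * pacing_gain = 1.0 and cwnd_gain = 2, a flow would pace at 1250 pkts/sec
 * and use cwnd = max(2 * 1250 * 0.040, 4) = 100 packets.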
 *
 * There is a public e-mail list for discussing BBR development and testing:
 *   https://groups.google.com/forum/#!forum/bbr-dev
 *
 * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
 * otherwise the TCP stack falls back to internal pacing using one high
 * resolution timer per TCP socket, which may use more resources.
 */
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
#include <linux/inet.h>
#include <linux/random.h>
#include <linux/win_minmax.h>

/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
 * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
 * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
 * Since the minimum window is >=4 packets, the lower bound isn't
 * an issue. The upper bound isn't an issue with existing technologies.
 */
#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)

/* BBR has the following modes for deciding how fast to send: */
enum bbr_mode {
	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
	BBR_DRAIN,	/* drain any queue created during startup */
	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
};

/* BBR congestion control block */
struct bbr {
	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
	u64	cycle_mstamp;	     /* time of this cycle phase start */
	u32     mode:3,		     /* current bbr_mode in state machine */
		prev_ca_state:3,     /* CA state on previous ACK */
		packet_conservation:1,  /* use packet conservation? */
		round_start:1,	     /* start of packet-timed tx->ack round? */
		idle_restart:1,	     /* restarting after idle? */
		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
		unused:13,
		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
		lt_rtt_cnt:7,	     /* round trips in long-term interval */
		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
	u32	pacing_gain:10,	/* current gain for setting pacing rate */
		cwnd_gain:10,	/* current gain for setting cwnd */
		full_bw_reached:1,   /* reached full bw in Startup? */
		full_bw_cnt:2,	/* number of rounds without large bw gains */
		cycle_idx:3,	/* current index in pacing_gain cycle array */
		has_seen_rtt:1, /* have we seen an RTT sample yet? */
		unused_b:5;
	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
	u32	full_bw;	/* recent bw, to estimate if pipe is full */

	/* For tracking ACK aggregation: */
	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
	u16	extra_acked[2];		/* max excess data ACKed in epoch */
	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
		extra_acked_win_idx:1,	/* current index in extra_acked array */
		unused_c:6;
};

#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */

/* Window length of bw filter (in rounds): */
static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of min_rtt filter (in sec): */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;

/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
 * In order to help drive the network toward lower queues and low latency while
 * maintaining high utilization, the average pacing rate aims to be slightly
 * lower than the estimated bandwidth. This is an important aspect of the
 * design.
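 * As a hypothetical example: with a 100 Mbit/s bandwidth estimate, the 1%
 * margin below makes the average pacing rate target roughly 99 Mbit/s.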
 */
static const int bbr_pacing_margin_percent = 1;

/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
 * that will allow a smoothly increasing pacing rate that will double each RTT
 * and send the same number of packets per RTT that an un-paced, slow-starting
 * Reno or CUBIC flow would:
 */
static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
 * the queue created in BBR_STARTUP in a single round:
 */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
static const int bbr_cwnd_gain  = BBR_UNIT * 2;
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
static const int bbr_pacing_gain[] = {
	BBR_UNIT * 5 / 4,	/* probe for more available bw */
	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
};
/* Randomize the starting gain cycling phase over N phases: */
static const u32 bbr_cycle_rand = 7;

/* Try to keep at least this many packets in flight, if things go smoothly. For
 * smooth functioning, a sliding window protocol ACKing every other packet
 * needs at least 4 packets in flight:
 */
static const u32 bbr_cwnd_min_target = 4;

/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
/* If bw has increased significantly (1.25x), there may be more bw available: */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
static const u32 bbr_full_bw_cnt = 3;

/* "long-term" ("LT") bandwidth estimator parameters... */
/* The minimum number of rounds in an LT bw sampling interval: */
static const u32 bbr_lt_intvl_min_rtts = 4;
/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
static const u32 bbr_lt_loss_thresh = 50;
/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;

/* Gain factor for adding extra_acked to target cwnd: */
static const int bbr_extra_acked_gain = BBR_UNIT;
/* Window length of extra_acked window. */
static const u32 bbr_extra_acked_win_rtts = 5;
/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
/* Time period for clamping cwnd increment due to ack aggregation */
static const u32 bbr_extra_acked_max_us = 100 * 1000;

static void bbr_check_probe_rtt_done(struct sock *sk);

/* Do we estimate that STARTUP filled the pipe? */
static bool bbr_full_bw_reached(const struct sock *sk)
{
	const struct bbr *bbr = inet_csk_ca(sk);

	return bbr->full_bw_reached;
}

/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
static u32 bbr_max_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return minmax_get(&bbr->bw);
}

/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
static u32 bbr_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
}

/* Return maximum extra acked in past k-2k round trips,
 * where k = bbr_extra_acked_win_rtts.
 */
static u16 bbr_extra_acked(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
}

/* Return rate in bytes per second, optionally with a gain.
 * The order here is chosen carefully to avoid overflow of u64. This should
 * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
 */
static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
	unsigned int mss = tcp_sk(sk)->mss_cache;

	rate *= mss;
	rate *= gain;
	rate >>= BBR_SCALE;
	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
	return rate >> BW_SCALE;
}

/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
	u64 rate = bw;

	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
	return rate;
}

/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;
	u32 rtt_us;

	if (tp->srtt_us) {		/* any RTT sample yet? */
		rtt_us = max(tp->srtt_us >> 3, 1U);
		bbr->has_seen_rtt = 1;
	} else {			 /* no RTT sample yet */
		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
	}
	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
	do_div(bw, rtt_us);
	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}

/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);

	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
		bbr_init_pacing_rate_from_rtt(sk);
	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
		sk->sk_pacing_rate = rate;
}

/* override sysctl_tcp_min_tso_segs */
__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
{
	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
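
/* Note: sk_pacing_rate is in bytes/sec, so bbr_min_tso_rate >> 3 converts
 * the 1200000 bit/sec floor above to 150000 bytes/sec: below that pacing
 * rate bbr_min_tso_segs() returns 1 segment per TSO burst, otherwise 2.
 */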

static u32 bbr_tso_segs_goal(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 segs, bytes;

	/* Sort of tcp_tso_autosize() but ignoring
	 * driver provided sk_gso_max_size.
	 */
	bytes = min_t(unsigned long,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));

	return min(segs, 0x7FU);
}

/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
static void bbr_save_cwnd(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
		bbr->prior_cwnd = tcp_snd_cwnd(tp);  /* this cwnd is good enough */
	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
		bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
}

__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (event == CA_EVENT_TX_START && tp->app_limited) {
		bbr->idle_restart = 1;
		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
		bbr->ack_epoch_acked = 0;
		/* Avoid pointless buffer overflows: pace at est. bw if we don't
		 * need more speed (we're restarting from idle and app-limited).
		 */
		if (bbr->mode == BBR_PROBE_BW)
			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
		else if (bbr->mode == BBR_PROBE_RTT)
			bbr_check_probe_rtt_done(sk);
	}
}

/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
 *
 * bdp = ceil(bw * min_rtt * gain)
 *
 * The key factor, gain, controls the amount of queue. While a small gain
 * builds a smaller queue, it becomes more vulnerable to noise in RTT
 * measurements (e.g., delayed ACKs or other ACK compression effects). This
 * noise may cause BBR to under-estimate the rate.
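 * As a hypothetical example: bw = 100 pkts/sec, min_rtt = 100 ms and
 * gain = 2.0 give bdp = ceil(100 * 0.100 * 2.0) = 20 packets.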
 */
static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bdp;
	u64 w;

	/* If we've never had a valid RTT sample, cap cwnd at the initial
	 * default. This should only happen when the connection is not using TCP
	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
	 */
	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/

	w = (u64)bw * bbr->min_rtt_us;

	/* Apply a gain to the given value, remove the BW_SCALE shift, and
	 * round the value up to avoid a negative feedback loop.
	 */
	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;

	return bdp;
}

/* To achieve full performance in high-speed paths, we budget enough cwnd to
 * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
 *   - one skb in sending host Qdisc,
 *   - one skb in sending host TSO/GSO engine
 *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
 * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
 * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
 * which allows 2 outstanding 2-packet sequences, to try to keep pipe
 * full even with ACK-every-other-packet delayed ACKs.
 */
static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
	struct bbr *bbr = inet_csk_ca(sk);

	/* Allow enough full-sized skbs in flight to utilize end systems. */
	cwnd += 3 * bbr_tso_segs_goal(sk);

	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
	cwnd = (cwnd + 1) & ~1U;

	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
		cwnd += 2;

	return cwnd;
}
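
/* A hypothetical walk through the budget above: cwnd = 10 with a TSO goal
 * of 2 segs becomes 10 + 3 * 2 = 16 (already even); in the bw-probing
 * PROBE_BW phase (cycle_idx == 0) it becomes 18.
 */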

/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
{
	u32 inflight;

	inflight = bbr_bdp(sk, bw, gain);
	inflight = bbr_quantization_budget(sk, inflight);

	return inflight;
}

/* With pacing at lower layers, there's often less data "in the network" than
 * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
 * we often have several skbs queued in the pacing layer with a pre-scheduled
 * earliest departure time (EDT). BBR adapts its pacing rate based on the
 * inflight level that it estimates has already been "baked in" by previous
 * departure time decisions. We calculate a rough estimate of the number of our
 * packets that might be in the network at the earliest departure time for the
 * next skb scheduled:
 *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
 * If we're increasing inflight, then we want to know if the transmit of the
 * EDT skb will push inflight above the target, so inflight_at_edt includes
 * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
 * then estimate if inflight will sink too low just before the EDT transmit.
 */
static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 now_ns, edt_ns, interval_us;
	u32 interval_delivered, inflight_at_edt;

	now_ns = tp->tcp_clock_cache;
	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
	inflight_at_edt = inflight_now;
	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
	if (interval_delivered >= inflight_at_edt)
		return 0;
	return inflight_at_edt - interval_delivered;
}
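
/* A hypothetical example of the EDT estimate above: with bw = 10 pkts/ms
 * and an earliest departure time 2 ms in the future, interval_delivered = 20,
 * so about 20 of the packets counted as in flight now are expected to have
 * left the network by the time that skb departs.
 */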

/* Find the cwnd increment based on estimate of ack aggregation */
static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
{
	u32 max_aggr_cwnd, aggr_cwnd = 0;

	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
				/ BW_UNIT;
		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
			     >> BBR_SCALE;
		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
	}

	return aggr_cwnd;
}

/* An optimization in BBR to reduce losses: On the first round of recovery, we
 * follow the packet conservation principle: send P packets per P packets acked.
 * After that, we slow-start and send at most 2*P packets per P packets acked.
 * After recovery finishes, or upon undo, we restore the cwnd we had when
 * recovery started (capped by the target cwnd based on estimated BDP).
 *
 * TODO(ycheng/ncardwell): implement a rate-based approach.
 */
static bool bbr_set_cwnd_to_recover_or_restore(
	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
	u32 cwnd = tcp_snd_cwnd(tp);

	/* An ACK for P pkts should release at most 2*P packets. We do this
	 * in two steps. First, here we deduct the number of lost packets.
	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
	 */
	if (rs->losses > 0)
		cwnd = max_t(s32, cwnd - rs->losses, 1);

	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
		/* Starting 1st round of Recovery, so do packet conservation. */
		bbr->packet_conservation = 1;
		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
		cwnd = tcp_packets_in_flight(tp) + acked;
	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
		/* Exiting loss recovery; restore cwnd saved before recovery. */
		cwnd = max(cwnd, bbr->prior_cwnd);
		bbr->packet_conservation = 0;
	}
	bbr->prev_ca_state = state;

	if (bbr->packet_conservation) {
		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
		return true;	/* yes, using packet conservation */
	}
	*new_cwnd = cwnd;
	return false;
}
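
/* A hypothetical example of the conservation step above: entering Recovery
 * with 30 packets in flight and an ACK covering 2 packets, cwnd is cut to
 * 30 + 2 = 32, so each ACK releases at most as much data as it covered.
 */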

/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
 * has drawn us down below target), or snap down to target if we're above it.
 */
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
			 u32 acked, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;

	if (!acked)
		goto done;  /* no packet fully ACKed; just apply caps */

	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
		goto done;

	target_cwnd = bbr_bdp(sk, bw, gain);

	/* Increment the cwnd to account for excess ACKed data that seems
	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
	 */
	target_cwnd += bbr_ack_aggregation_cwnd(sk);
	target_cwnd = bbr_quantization_budget(sk, target_cwnd);

	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
		cwnd = min(cwnd + acked, target_cwnd);
	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
		cwnd = cwnd + acked;
	cwnd = max(cwnd, bbr_cwnd_min_target);

done:
	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));	/* apply global cap */
	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
}
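
/* A hypothetical feel for the gain cycling below: with an estimated BDP of
 * 100 packets, the 5/4 phase tries to raise inflight to about 125 packets
 * (plus the quantization budget) before advancing, and the 3/4 phase then
 * drains any extra queue that probing may have built.
 */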

/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
static bool bbr_is_next_cycle_phase(struct sock *sk,
				    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool is_full_length =
		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
		bbr->min_rtt_us;
	u32 inflight, bw;

	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
	 * use the pipe without increasing the queue.
	 */
	if (bbr->pacing_gain == BBR_UNIT)
		return is_full_length;		/* just use wall clock time */

	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
	bw = bbr_max_bw(sk);

	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
	 * small (e.g. on a LAN). We do not persist if packets are lost, since
	 * a path with small buffers may not hold that much.
	 */
	if (bbr->pacing_gain > BBR_UNIT)
		return is_full_length &&
			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));

	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
	 * probing didn't find more bw. If inflight falls to match BDP then we
	 * estimate queue is drained; persisting would underutilize the pipe.
	 */
	return is_full_length ||
		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}

static void bbr_advance_cycle_phase(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
	bbr->cycle_mstamp = tp->delivered_mstamp;
}

/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
static void bbr_update_cycle_phase(struct sock *sk,
				   const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
		bbr_advance_cycle_phase(sk);
}

static void bbr_reset_startup_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_STARTUP;
}

static void bbr_reset_probe_bw_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_PROBE_BW;
	bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
}

static void bbr_reset_mode(struct sock *sk)
{
	if (!bbr_full_bw_reached(sk))
		bbr_reset_startup_mode(sk);
	else
		bbr_reset_probe_bw_mode(sk);
}

/* Start a new long-term sampling interval. */
static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
	bbr->lt_last_delivered = tp->delivered;
	bbr->lt_last_lost = tp->lost;
	bbr->lt_rtt_cnt = 0;
}

/* Completely reset long-term bandwidth sampling. */
static void bbr_reset_lt_bw_sampling(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_bw = 0;
	bbr->lt_use_bw = 0;
	bbr->lt_is_sampling = false;
	bbr_reset_lt_bw_sampling_interval(sk);
}

/* Long-term bw sampling interval is done. Estimate whether we're policed. */
static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 diff;

	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
		/* Is new bw close to the lt_bw from the previous interval? */
		diff = abs(bw - bbr->lt_bw);
		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
		     bbr_lt_bw_diff)) {
			/* All criteria are met; estimate we're policed. */
			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
			bbr->lt_use_bw = 1;
			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
			bbr->lt_rtt_cnt = 0;
			return;
		}
	}
	bbr->lt_bw = bw;
	bbr_reset_lt_bw_sampling_interval(sk);
}

/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
 * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
 * explicitly models their policed rate, to reduce unnecessary losses. We
 * estimate that we're policed if we see 2 consecutive sampling intervals with
 * consistent throughput and high packet loss. If we think we're being policed,
 * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
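 * As a hypothetical example: intervals measuring 10.0 and 10.5 Mbit/s differ
 * by under 5%, within the 1/8 (12.5%) bbr_lt_bw_ratio bound, so they count
 * as "consistent"; with high loss in both, lt_bw becomes their average,
 * roughly 10.25 Mbit/s.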
 */
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 lost, delivered;
	u64 bw;
	u32 t;

	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
		}
		return;
	}

	/* Wait for the first loss before sampling, to let the policer exhaust
	 * its tokens and estimate the steady-state rate allowed by the policer.
	 * Starting samples earlier includes bursts that over-estimate the bw.
	 */
	if (!bbr->lt_is_sampling) {
		if (!rs->losses)
			return;
		bbr_reset_lt_bw_sampling_interval(sk);
		bbr->lt_is_sampling = true;
	}

	/* To avoid underestimates, reset sampling if we run out of data. */
	if (rs->is_app_limited) {
		bbr_reset_lt_bw_sampling(sk);
		return;
	}

	if (bbr->round_start)
		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
		return;		/* sampling interval needs to be longer */
	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
		return;
	}

	/* End sampling interval when a packet is lost, so we estimate the
	 * policer tokens were exhausted. Stopping the sampling before the
	 * tokens are exhausted under-estimates the policed rate.
	 */
	if (!rs->losses)
		return;

	/* Calculate packets lost and delivered in sampling interval. */
	lost = tp->lost - bbr->lt_last_lost;
	delivered = tp->delivered - bbr->lt_last_delivered;
	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
		return;
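
	/* A hypothetical example of the check above: delivered = 100 and
	 * lost = 20 gives (20 << 8) = 5120 >= 50 * 100 = 5000, i.e. a 20%
	 * loss rate qualifies this interval as "lossy".
	 */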

	/* Find average delivery rate in this sampling interval. */
	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
	if ((s32)t < 1)
		return;		/* interval is less than one ms, so wait */
	/* Check if we can multiply without overflow */
	if (t >= ~0U / USEC_PER_MSEC) {
		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
		return;
	}
	t *= USEC_PER_MSEC;
	bw = (u64)delivered * BW_UNIT;
	do_div(bw, t);
	bbr_lt_bw_interval_done(sk, bw);
}

/* Estimate the bandwidth based on how fast packets are delivered */
static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;

	bbr->round_start = 0;
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return; /* Not a valid observation */

	/* See if we've reached the next RTT */
	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
		bbr->next_rtt_delivered = tp->delivered;
		bbr->rtt_cnt++;
		bbr->round_start = 1;
		bbr->packet_conservation = 0;
	}

	bbr_lt_bw_sampling(sk, rs);

	/* Divide delivered by the interval to find a (lower bound) bottleneck
	 * bandwidth sample. Delivered is in packets and interval_us in uS and
	 * ratio will be <<1 for most connections. So delivered is first scaled.
	 */
	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
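
	/* A hypothetical example of the sample above: rs->delivered = 10
	 * pkts over rs->interval_us = 10000 gives bw = 10 * 2^24 / 10000
	 * ~= 16777 units, i.e. about 1000 pkts/sec.
	 */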

	/* If this sample is application-limited, it is likely to have a very
	 * low delivered count that represents application behavior rather than
	 * the available network rate. Such a sample could drag down estimated
	 * bw, causing needless slow-down. Thus, to continue to send at the
	 * last measured network rate, we filter out app-limited samples unless
	 * they describe the path bw at least as well as our bw model.
	 *
	 * So the goal during app-limited phase is to proceed with the best
	 * network rate no matter how long. We automatically leave this
	 * phase when app writes faster than the network can deliver :)
	 */
	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
		/* Incorporate new sample into our max bw filter. */
		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
	}
}

/* Estimates the windowed max degree of ack aggregation.
 * This is used to provision extra in-flight data to keep sending during
 * inter-ACK silences.
 *
 * Degree of ack aggregation is estimated as extra data acked beyond expected.
 *
 * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
 * cwnd += max_extra_acked
 *
 * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
 * Max filter is an approximate sliding window of 5-10 (packet timed) round
 * trips.
 */
static void bbr_update_ack_aggregation(struct sock *sk,
				       const struct rate_sample *rs)
{
	u32 epoch_us, expected_acked, extra_acked;
	struct bbr *bbr = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
	    rs->delivered < 0 || rs->interval_us <= 0)
		return;

	if (bbr->round_start) {
		bbr->extra_acked_win_rtts = min(0x1F,
						bbr->extra_acked_win_rtts + 1);
		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
			bbr->extra_acked_win_rtts = 0;
			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
						   0 : 1;
			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
		}
	}

	/* Compute how many packets we expected to be delivered over epoch. */
	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
				      bbr->ack_epoch_mstamp);
	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;

	/* Reset the aggregation epoch if the ACK rate is below the expected
	 * rate, or if a significantly large number of ACKs has been received
	 * since the epoch started (the epoch is potentially quite old).
	 */
	if (bbr->ack_epoch_acked <= expected_acked ||
	    (bbr->ack_epoch_acked + rs->acked_sacked >=
	     bbr_ack_epoch_acked_reset_thresh)) {
		bbr->ack_epoch_acked = 0;
		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
		expected_acked = 0;
	}

	/* Compute excess data delivered, beyond what was expected. */
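	/* A hypothetical example: if expected_acked = 10 for this epoch but
	 * 25 packets have been (S)ACKed, extra_acked = 25 - 10 = 15, capped
	 * at the current cwnd.
	 */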
	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
				     bbr->ack_epoch_acked + rs->acked_sacked);
	extra_acked = bbr->ack_epoch_acked - expected_acked;
	extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
}

/* Estimate when the pipe is full, using the change in delivery rate: BBR
 * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
 * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
 * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
 * higher rwin, 3: we get higher delivery rate samples. Or transient
 * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
 * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
 */
static void bbr_check_full_bw_reached(struct sock *sk,
				      const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw_thresh;

	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
		return;

	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
	if (bbr_max_bw(sk) >= bw_thresh) {
		bbr->full_bw = bbr_max_bw(sk);
		bbr->full_bw_cnt = 0;
		return;
	}
	++bbr->full_bw_cnt;
	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}
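
/* A hypothetical example of the test above: with full_bw = 80 units,
 * bw_thresh = 80 * 320 >> 8 = 100, so unless max_bw reaches 100 in one of
 * 3 consecutive non-app-limited round starts, full_bw_reached is set.
 */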

/* If pipe is probably full, drain the queue and then enter steady-state. */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain queue we created */
		tcp_sk(sk)->snd_ssthresh =
				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
	}	/* fall through to check if in-flight is already small: */
	if (bbr->mode == BBR_DRAIN &&
	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

static void bbr_check_probe_rtt_done(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (!(bbr->probe_rtt_done_stamp &&
	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
		return;

	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
	bbr_reset_mode(sk);
}

/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
 * periodically drain the bottleneck queue, to converge to measure the true
 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
 * small (reducing queuing delay and packet loss) and achieve fairness among
 * BBR flows.
 *
 * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
 * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
 * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
 * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
 * re-enter the previous mode. BBR uses 200ms to approximately bound the
 * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
 *
 * Note that flows need only pay 2% if they are busy sending over the last 10
 * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
 * natural silences or low-rate periods within 10 seconds where the rate is low
 * enough for long enough to drain its queue in the bottleneck. We pick up
 * these min RTT measurements opportunistically with our min_rtt filter. :-)
/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
 * periodically drain the bottleneck queue, to converge to measure the true
 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
 * small (reducing queuing delay and packet loss) and achieve fairness among
 * BBR flows.
 *
 * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
 * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
 * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
 * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
 * re-enter the previous mode. BBR uses 200ms to approximately bound the
 * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
 *
 * Note that flows need only pay 2% if they are busy sending over the last 10
 * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
 * natural silences or low-rate periods within 10 seconds where the rate is low
 * enough for long enough to drain its queue in the bottleneck. We pick up
 * these min RTT measurements opportunistically with our min_rtt filter. :-)
 */
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool filter_expired;

	/* Track min RTT seen in the min_rtt_win_sec filter window: */
	filter_expired = after(tcp_jiffies32,
			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
	if (rs->rtt_us >= 0 &&
	    (rs->rtt_us < bbr->min_rtt_us ||
	     (filter_expired && !rs->is_ack_delayed))) {
		bbr->min_rtt_us = rs->rtt_us;
		bbr->min_rtt_stamp = tcp_jiffies32;
	}

	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
		bbr->probe_rtt_done_stamp = 0;
	}

	if (bbr->mode == BBR_PROBE_RTT) {
		/* Ignore low rate samples during this mode. */
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
		/* Maintain min packets in flight for max(200 ms, 1 round). */
		if (!bbr->probe_rtt_done_stamp &&
		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
			bbr->probe_rtt_round_done = 0;
			bbr->next_rtt_delivered = tp->delivered;
		} else if (bbr->probe_rtt_done_stamp) {
			if (bbr->round_start)
				bbr->probe_rtt_round_done = 1;
			if (bbr->probe_rtt_round_done)
				bbr_check_probe_rtt_done(sk);
		}
	}
	/* Restart after idle ends only once we process a new S/ACK for data */
	if (rs->delivered > 0)
		bbr->idle_restart = 0;
}
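/* For reference on the gains chosen below (values as described in the BBR
 * paper; the actual constants are defined earlier in this file): STARTUP
 * uses a high gain of about 2/ln(2) ~= 2.89 for both pacing and cwnd;
 * DRAIN inverts the pacing gain (~1/2.89) to bleed the startup queue while
 * keeping the cwnd gain high; PROBE_BW cycles pacing_gain through
 * bbr_pacing_gain[] unless long-term bw mode pins it at 1.0; and PROBE_RTT
 * pins both gains to 1.0 (BBR_UNIT).
 */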
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_STARTUP:
		bbr->pacing_gain = bbr_high_gain;
		bbr->cwnd_gain = bbr_high_gain;
		break;
	case BBR_DRAIN:
		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
		bbr->cwnd_gain = bbr_high_gain;		/* keep cwnd */
		break;
	case BBR_PROBE_BW:
		bbr->pacing_gain = (bbr->lt_use_bw ?
				    BBR_UNIT :
				    bbr_pacing_gain[bbr->cycle_idx]);
		bbr->cwnd_gain = bbr_cwnd_gain;
		break;
	case BBR_PROBE_RTT:
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain = BBR_UNIT;
		break;
	default:
		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
		break;
	}
}

static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_ack_aggregation(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
	bbr_update_gains(sk);
}

__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw;

	bbr_update_model(sk, rs);

	bw = bbr_bw(sk);
	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
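/* bbr_main() above is wired up as the .cong_control hook in
 * tcp_bbr_cong_ops below, so the stack invokes it once per ACK with a
 * fresh rate_sample: first the model (bw, min_rtt, mode, gains) is
 * updated, then the two control outputs -- pacing rate and cwnd -- are
 * derived from it.
 */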
__bpf_kfunc static void bbr_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->prior_cwnd = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	bbr->rtt_cnt = 0;
	bbr->next_rtt_delivered = tp->delivered;
	bbr->prev_ca_state = TCP_CA_Open;
	bbr->packet_conservation = 0;

	bbr->probe_rtt_done_stamp = 0;
	bbr->probe_rtt_round_done = 0;
	bbr->min_rtt_us = tcp_min_rtt(tp);
	bbr->min_rtt_stamp = tcp_jiffies32;

	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */

	bbr->has_seen_rtt = 0;
	bbr_init_pacing_rate_from_rtt(sk);

	bbr->round_start = 0;
	bbr->idle_restart = 0;
	bbr->full_bw_reached = 0;
	bbr->full_bw = 0;
	bbr->full_bw_cnt = 0;
	bbr->cycle_mstamp = 0;
	bbr->cycle_idx = 0;
	bbr_reset_lt_bw_sampling(sk);
	bbr_reset_startup_mode(sk);

	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
	bbr->ack_epoch_acked = 0;
	bbr->extra_acked_win_rtts = 0;
	bbr->extra_acked_win_idx = 0;
	bbr->extra_acked[0] = 0;
	bbr->extra_acked[1] = 0;

	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
{
	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
	return 3;
}

/* In theory BBR does not need to undo the cwnd since it does not
 * always reduce cwnd on losses (see bbr_main()). Keep it for now.
 */
__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
	bbr->full_bw_cnt = 0;
	bbr_reset_lt_bw_sampling(sk);
	return tcp_snd_cwnd(tcp_sk(sk));
}

/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
{
	bbr_save_cwnd(sk);
	return tcp_sk(sk)->snd_ssthresh;
}

static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
			   union tcp_cc_info *info)
{
	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct bbr *bbr = inet_csk_ca(sk);
		u64 bw = bbr_bw(sk);

		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
		memset(&info->bbr, 0, sizeof(info->bbr));
		info->bbr.bbr_bw_lo		= (u32)bw;
		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
		*attr = INET_DIAG_BBRINFO;
		return sizeof(info->bbr);
	}
	return 0;
}
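/* Unit check for the conversion in bbr_get_info() (illustrative numbers):
 * bbr_bw() is in packets per usec, left-shifted by BW_SCALE, so
 * bw * mss_cache * USEC_PER_SEC >> BW_SCALE yields bytes per second.
 * E.g. a raw bw of BW_UNIT (1 pkt/usec) with a 1500-byte MSS reports
 * 1500 * 10^6 = 1.5e9 B/s, i.e. 12 Gbit/s.
 */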
__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (new_state == TCP_CA_Loss) {
		struct rate_sample rs = { .losses = 1 };

		bbr->prev_ca_state = TCP_CA_Loss;
		bbr->full_bw = 0;
		bbr->round_start = 1;	/* treat RTO like end of a round */
		bbr_lt_bw_sampling(sk, &rs);
	}
}

static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "bbr",
	.owner		= THIS_MODULE,
	.init		= bbr_init,
	.cong_control	= bbr_main,
	.sndbuf_expand	= bbr_sndbuf_expand,
	.undo_cwnd	= bbr_undo_cwnd,
	.cwnd_event	= bbr_cwnd_event,
	.ssthresh	= bbr_ssthresh,
	.min_tso_segs	= bbr_min_tso_segs,
	.get_info	= bbr_get_info,
	.set_state	= bbr_set_state,
};

BTF_SET8_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, bbr_init)
BTF_ID_FLAGS(func, bbr_main)
BTF_ID_FLAGS(func, bbr_sndbuf_expand)
BTF_ID_FLAGS(func, bbr_undo_cwnd)
BTF_ID_FLAGS(func, bbr_cwnd_event)
BTF_ID_FLAGS(func, bbr_ssthresh)
BTF_ID_FLAGS(func, bbr_min_tso_segs)
BTF_ID_FLAGS(func, bbr_set_state)
#endif
#endif
BTF_SET8_END(tcp_bbr_check_kfunc_ids)

static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &tcp_bbr_check_kfunc_ids,
};

static int __init bbr_register(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
	if (ret < 0)
		return ret;
	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}

static void __exit bbr_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}

module_init(bbr_register);
module_exit(bbr_unregister);
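/* Usage note (standard kernel practice, not specific to this file): once
 * this module is loaded (e.g. "modprobe tcp_bbr"), the algorithm can be
 * selected system-wide via
 *	sysctl net.ipv4.tcp_congestion_control=bbr
 * or per-socket with
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bbr", 3);
 */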
MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");