162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * TCP NV: TCP with Congestion Avoidance 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * TCP-NV is a successor of TCP-Vegas that has been developed to 662306a36Sopenharmony_ci * deal with the issues that occur in modern networks. 762306a36Sopenharmony_ci * Like TCP-Vegas, TCP-NV supports true congestion avoidance, 862306a36Sopenharmony_ci * the ability to detect congestion before packet losses occur. 962306a36Sopenharmony_ci * When congestion (queue buildup) starts to occur, TCP-NV 1062306a36Sopenharmony_ci * predicts what the cwnd size should be for the current 1162306a36Sopenharmony_ci * throughput and it reduces the cwnd proportionally to 1262306a36Sopenharmony_ci * the difference between the current cwnd and the predicted cwnd. 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * NV is only recommeneded for traffic within a data center, and when 1562306a36Sopenharmony_ci * all the flows are NV (at least those within the data center). This 1662306a36Sopenharmony_ci * is due to the inherent unfairness between flows using losses to 1762306a36Sopenharmony_ci * detect congestion (congestion control) and those that use queue 1862306a36Sopenharmony_ci * buildup to detect congestion (congestion avoidance). 1962306a36Sopenharmony_ci * 2062306a36Sopenharmony_ci * Note: High NIC coalescence values may lower the performance of NV 2162306a36Sopenharmony_ci * due to the increased noise in RTT values. In particular, we have 2262306a36Sopenharmony_ci * seen issues with rx-frames values greater than 8. 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * TODO: 2562306a36Sopenharmony_ci * 1) Add mechanism to deal with reverse congestion. 2662306a36Sopenharmony_ci */ 2762306a36Sopenharmony_ci 2862306a36Sopenharmony_ci#include <linux/module.h> 2962306a36Sopenharmony_ci#include <linux/math64.h> 3062306a36Sopenharmony_ci#include <net/tcp.h> 3162306a36Sopenharmony_ci#include <linux/inet_diag.h> 3262306a36Sopenharmony_ci 3362306a36Sopenharmony_ci/* TCP NV parameters 3462306a36Sopenharmony_ci * 3562306a36Sopenharmony_ci * nv_pad Max number of queued packets allowed in network 3662306a36Sopenharmony_ci * nv_pad_buffer Do not grow cwnd if this closed to nv_pad 3762306a36Sopenharmony_ci * nv_reset_period How often (in) seconds)to reset min_rtt 3862306a36Sopenharmony_ci * nv_min_cwnd Don't decrease cwnd below this if there are no losses 3962306a36Sopenharmony_ci * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected 4062306a36Sopenharmony_ci * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 4162306a36Sopenharmony_ci * nv_rtt_factor RTT averaging factor 4262306a36Sopenharmony_ci * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur 4362306a36Sopenharmony_ci * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd 4462306a36Sopenharmony_ci * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd 4562306a36Sopenharmony_ci * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping 4662306a36Sopenharmony_ci * slow-start due to congestion 4762306a36Sopenharmony_ci * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion 4862306a36Sopenharmony_ci * nv_rtt_min_cnt Wait these many RTTs before making congesion decision 4962306a36Sopenharmony_ci * nv_cwnd_growth_rate_neg 5062306a36Sopenharmony_ci * nv_cwnd_growth_rate_pos 5162306a36Sopenharmony_ci * How quickly to double growth rate (not rate) of cwnd when not 5262306a36Sopenharmony_ci * congested. One value (nv_cwnd_growth_rate_neg) for when 5362306a36Sopenharmony_ci * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) 5462306a36Sopenharmony_ci * otherwise. 5562306a36Sopenharmony_ci */ 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_cistatic int nv_pad __read_mostly = 10; 5862306a36Sopenharmony_cistatic int nv_pad_buffer __read_mostly = 2; 5962306a36Sopenharmony_cistatic int nv_reset_period __read_mostly = 5; /* in seconds */ 6062306a36Sopenharmony_cistatic int nv_min_cwnd __read_mostly = 2; 6162306a36Sopenharmony_cistatic int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ 6262306a36Sopenharmony_cistatic int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ 6362306a36Sopenharmony_cistatic int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ 6462306a36Sopenharmony_cistatic int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ 6562306a36Sopenharmony_cistatic int nv_cwnd_growth_rate_neg __read_mostly = 8; 6662306a36Sopenharmony_cistatic int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ 6762306a36Sopenharmony_cistatic int nv_dec_eval_min_calls __read_mostly = 60; 6862306a36Sopenharmony_cistatic int nv_inc_eval_min_calls __read_mostly = 20; 6962306a36Sopenharmony_cistatic int nv_ssthresh_eval_min_calls __read_mostly = 30; 7062306a36Sopenharmony_cistatic int nv_stop_rtt_cnt __read_mostly = 10; 7162306a36Sopenharmony_cistatic int nv_rtt_min_cnt __read_mostly = 2; 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_cimodule_param(nv_pad, int, 0644); 7462306a36Sopenharmony_ciMODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); 7562306a36Sopenharmony_cimodule_param(nv_reset_period, int, 0644); 7662306a36Sopenharmony_ciMODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); 7762306a36Sopenharmony_cimodule_param(nv_min_cwnd, int, 0644); 7862306a36Sopenharmony_ciMODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" 7962306a36Sopenharmony_ci " without losses"); 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci/* TCP NV Parameters */ 8262306a36Sopenharmony_cistruct tcpnv { 8362306a36Sopenharmony_ci unsigned long nv_min_rtt_reset_jiffies; /* when to switch to 8462306a36Sopenharmony_ci * nv_min_rtt_new */ 8562306a36Sopenharmony_ci s8 cwnd_growth_factor; /* Current cwnd growth factor, 8662306a36Sopenharmony_ci * < 0 => less than 1 packet/RTT */ 8762306a36Sopenharmony_ci u8 available8; 8862306a36Sopenharmony_ci u16 available16; 8962306a36Sopenharmony_ci u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ 9062306a36Sopenharmony_ci nv_reset:1, /* whether to reset values */ 9162306a36Sopenharmony_ci nv_catchup:1; /* whether we are growing because 9262306a36Sopenharmony_ci * of temporary cwnd decrease */ 9362306a36Sopenharmony_ci u8 nv_eval_call_cnt; /* call count since last eval */ 9462306a36Sopenharmony_ci u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is 9562306a36Sopenharmony_ci * smaller than this. It may grow to handle 9662306a36Sopenharmony_ci * TSO, LRO and interrupt coalescence because 9762306a36Sopenharmony_ci * with these a small cwnd cannot saturate 9862306a36Sopenharmony_ci * the link. Note that this is different from 9962306a36Sopenharmony_ci * the file local nv_min_cwnd */ 10062306a36Sopenharmony_ci u8 nv_rtt_cnt; /* RTTs without making ca decision */; 10162306a36Sopenharmony_ci u32 nv_last_rtt; /* last rtt */ 10262306a36Sopenharmony_ci u32 nv_min_rtt; /* active min rtt. Used to determine slope */ 10362306a36Sopenharmony_ci u32 nv_min_rtt_new; /* min rtt for future use */ 10462306a36Sopenharmony_ci u32 nv_base_rtt; /* If non-zero it represents the threshold for 10562306a36Sopenharmony_ci * congestion */ 10662306a36Sopenharmony_ci u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is 10762306a36Sopenharmony_ci * set to 80% of nv_base_rtt. It helps reduce 10862306a36Sopenharmony_ci * unfairness between flows */ 10962306a36Sopenharmony_ci u32 nv_rtt_max_rate; /* max rate seen during current RTT */ 11062306a36Sopenharmony_ci u32 nv_rtt_start_seq; /* current RTT ends when packet arrives 11162306a36Sopenharmony_ci * acking beyond nv_rtt_start_seq */ 11262306a36Sopenharmony_ci u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is 11362306a36Sopenharmony_ci * used to determine bytes acked since last 11462306a36Sopenharmony_ci * call to bictcp_acked */ 11562306a36Sopenharmony_ci u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ 11662306a36Sopenharmony_ci}; 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci#define NV_INIT_RTT U32_MAX 11962306a36Sopenharmony_ci#define NV_MIN_CWND 4 12062306a36Sopenharmony_ci#define NV_MIN_CWND_GROW 2 12162306a36Sopenharmony_ci#define NV_TSO_CWND_BOUND 80 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_cistatic inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci ca->nv_reset = 0; 12862306a36Sopenharmony_ci ca->nv_no_cong_cnt = 0; 12962306a36Sopenharmony_ci ca->nv_rtt_cnt = 0; 13062306a36Sopenharmony_ci ca->nv_last_rtt = 0; 13162306a36Sopenharmony_ci ca->nv_rtt_max_rate = 0; 13262306a36Sopenharmony_ci ca->nv_rtt_start_seq = tp->snd_una; 13362306a36Sopenharmony_ci ca->nv_eval_call_cnt = 0; 13462306a36Sopenharmony_ci ca->nv_last_snd_una = tp->snd_una; 13562306a36Sopenharmony_ci} 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_cistatic void tcpnv_init(struct sock *sk) 13862306a36Sopenharmony_ci{ 13962306a36Sopenharmony_ci struct tcpnv *ca = inet_csk_ca(sk); 14062306a36Sopenharmony_ci int base_rtt; 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci tcpnv_reset(ca, sk); 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci /* See if base_rtt is available from socket_ops bpf program. 14562306a36Sopenharmony_ci * It is meant to be used in environments, such as communication 14662306a36Sopenharmony_ci * within a datacenter, where we have reasonable estimates of 14762306a36Sopenharmony_ci * RTTs 14862306a36Sopenharmony_ci */ 14962306a36Sopenharmony_ci base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); 15062306a36Sopenharmony_ci if (base_rtt > 0) { 15162306a36Sopenharmony_ci ca->nv_base_rtt = base_rtt; 15262306a36Sopenharmony_ci ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ 15362306a36Sopenharmony_ci } else { 15462306a36Sopenharmony_ci ca->nv_base_rtt = 0; 15562306a36Sopenharmony_ci ca->nv_lower_bound_rtt = 0; 15662306a36Sopenharmony_ci } 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 1; 15962306a36Sopenharmony_ci ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; 16062306a36Sopenharmony_ci ca->nv_min_rtt = NV_INIT_RTT; 16162306a36Sopenharmony_ci ca->nv_min_rtt_new = NV_INIT_RTT; 16262306a36Sopenharmony_ci ca->nv_min_cwnd = NV_MIN_CWND; 16362306a36Sopenharmony_ci ca->nv_catchup = 0; 16462306a36Sopenharmony_ci ca->cwnd_growth_factor = 0; 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) 16862306a36Sopenharmony_ci * bounds to RTT. 16962306a36Sopenharmony_ci */ 17062306a36Sopenharmony_ciinline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) 17162306a36Sopenharmony_ci{ 17262306a36Sopenharmony_ci if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) 17362306a36Sopenharmony_ci return ca->nv_lower_bound_rtt; 17462306a36Sopenharmony_ci else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) 17562306a36Sopenharmony_ci return ca->nv_base_rtt; 17662306a36Sopenharmony_ci else 17762306a36Sopenharmony_ci return val; 17862306a36Sopenharmony_ci} 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_cistatic void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) 18162306a36Sopenharmony_ci{ 18262306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 18362306a36Sopenharmony_ci struct tcpnv *ca = inet_csk_ca(sk); 18462306a36Sopenharmony_ci u32 cnt; 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci if (!tcp_is_cwnd_limited(sk)) 18762306a36Sopenharmony_ci return; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci /* Only grow cwnd if NV has not detected congestion */ 19062306a36Sopenharmony_ci if (!ca->nv_allow_cwnd_growth) 19162306a36Sopenharmony_ci return; 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci if (tcp_in_slow_start(tp)) { 19462306a36Sopenharmony_ci acked = tcp_slow_start(tp, acked); 19562306a36Sopenharmony_ci if (!acked) 19662306a36Sopenharmony_ci return; 19762306a36Sopenharmony_ci } 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci if (ca->cwnd_growth_factor < 0) { 20062306a36Sopenharmony_ci cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor; 20162306a36Sopenharmony_ci tcp_cong_avoid_ai(tp, cnt, acked); 20262306a36Sopenharmony_ci } else { 20362306a36Sopenharmony_ci cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor); 20462306a36Sopenharmony_ci tcp_cong_avoid_ai(tp, cnt, acked); 20562306a36Sopenharmony_ci } 20662306a36Sopenharmony_ci} 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_cistatic u32 tcpnv_recalc_ssthresh(struct sock *sk) 20962306a36Sopenharmony_ci{ 21062306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U); 21362306a36Sopenharmony_ci} 21462306a36Sopenharmony_ci 21562306a36Sopenharmony_cistatic void tcpnv_state(struct sock *sk, u8 new_state) 21662306a36Sopenharmony_ci{ 21762306a36Sopenharmony_ci struct tcpnv *ca = inet_csk_ca(sk); 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci if (new_state == TCP_CA_Open && ca->nv_reset) { 22062306a36Sopenharmony_ci tcpnv_reset(ca, sk); 22162306a36Sopenharmony_ci } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || 22262306a36Sopenharmony_ci new_state == TCP_CA_Recovery) { 22362306a36Sopenharmony_ci ca->nv_reset = 1; 22462306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 0; 22562306a36Sopenharmony_ci if (new_state == TCP_CA_Loss) { 22662306a36Sopenharmony_ci /* Reset cwnd growth factor to Reno value */ 22762306a36Sopenharmony_ci if (ca->cwnd_growth_factor > 0) 22862306a36Sopenharmony_ci ca->cwnd_growth_factor = 0; 22962306a36Sopenharmony_ci /* Decrease growth rate if allowed */ 23062306a36Sopenharmony_ci if (nv_cwnd_growth_rate_neg > 0 && 23162306a36Sopenharmony_ci ca->cwnd_growth_factor > -8) 23262306a36Sopenharmony_ci ca->cwnd_growth_factor--; 23362306a36Sopenharmony_ci } 23462306a36Sopenharmony_ci } 23562306a36Sopenharmony_ci} 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci/* Do congestion avoidance calculations for TCP-NV 23862306a36Sopenharmony_ci */ 23962306a36Sopenharmony_cistatic void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) 24062306a36Sopenharmony_ci{ 24162306a36Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 24262306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 24362306a36Sopenharmony_ci struct tcpnv *ca = inet_csk_ca(sk); 24462306a36Sopenharmony_ci unsigned long now = jiffies; 24562306a36Sopenharmony_ci u64 rate64; 24662306a36Sopenharmony_ci u32 rate, max_win, cwnd_by_slope; 24762306a36Sopenharmony_ci u32 avg_rtt; 24862306a36Sopenharmony_ci u32 bytes_acked = 0; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci /* Some calls are for duplicates without timetamps */ 25162306a36Sopenharmony_ci if (sample->rtt_us < 0) 25262306a36Sopenharmony_ci return; 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ 25562306a36Sopenharmony_ci if (icsk->icsk_ca_state != TCP_CA_Open && 25662306a36Sopenharmony_ci icsk->icsk_ca_state != TCP_CA_Disorder) 25762306a36Sopenharmony_ci return; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci /* Stop cwnd growth if we were in catch up mode */ 26062306a36Sopenharmony_ci if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) { 26162306a36Sopenharmony_ci ca->nv_catchup = 0; 26262306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 0; 26362306a36Sopenharmony_ci } 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci bytes_acked = tp->snd_una - ca->nv_last_snd_una; 26662306a36Sopenharmony_ci ca->nv_last_snd_una = tp->snd_una; 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci if (sample->in_flight == 0) 26962306a36Sopenharmony_ci return; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci /* Calculate moving average of RTT */ 27262306a36Sopenharmony_ci if (nv_rtt_factor > 0) { 27362306a36Sopenharmony_ci if (ca->nv_last_rtt > 0) { 27462306a36Sopenharmony_ci avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + 27562306a36Sopenharmony_ci ((u64)ca->nv_last_rtt) 27662306a36Sopenharmony_ci * (256 - nv_rtt_factor)) >> 8; 27762306a36Sopenharmony_ci } else { 27862306a36Sopenharmony_ci avg_rtt = sample->rtt_us; 27962306a36Sopenharmony_ci ca->nv_min_rtt = avg_rtt << 1; 28062306a36Sopenharmony_ci } 28162306a36Sopenharmony_ci ca->nv_last_rtt = avg_rtt; 28262306a36Sopenharmony_ci } else { 28362306a36Sopenharmony_ci avg_rtt = sample->rtt_us; 28462306a36Sopenharmony_ci } 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci /* rate in 100's bits per second */ 28762306a36Sopenharmony_ci rate64 = ((u64)sample->in_flight) * 80000; 28862306a36Sopenharmony_ci do_div(rate64, avg_rtt ?: 1); 28962306a36Sopenharmony_ci rate = (u32)rate64; 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci /* Remember the maximum rate seen during this RTT 29262306a36Sopenharmony_ci * Note: It may be more than one RTT. This function should be 29362306a36Sopenharmony_ci * called at least nv_dec_eval_min_calls times. 29462306a36Sopenharmony_ci */ 29562306a36Sopenharmony_ci if (ca->nv_rtt_max_rate < rate) 29662306a36Sopenharmony_ci ca->nv_rtt_max_rate = rate; 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci /* We have valid information, increment counter */ 29962306a36Sopenharmony_ci if (ca->nv_eval_call_cnt < 255) 30062306a36Sopenharmony_ci ca->nv_eval_call_cnt++; 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ci /* Apply bounds to rtt. Only used to update min_rtt */ 30362306a36Sopenharmony_ci avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci /* update min rtt if necessary */ 30662306a36Sopenharmony_ci if (avg_rtt < ca->nv_min_rtt) 30762306a36Sopenharmony_ci ca->nv_min_rtt = avg_rtt; 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ci /* update future min_rtt if necessary */ 31062306a36Sopenharmony_ci if (avg_rtt < ca->nv_min_rtt_new) 31162306a36Sopenharmony_ci ca->nv_min_rtt_new = avg_rtt; 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci /* nv_min_rtt is updated with the minimum (possibley averaged) rtt 31462306a36Sopenharmony_ci * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a 31562306a36Sopenharmony_ci * warm reset). This new nv_min_rtt will be continued to be updated 31662306a36Sopenharmony_ci * and be used for another sysctl_tcp_nv_reset_period seconds, 31762306a36Sopenharmony_ci * when it will be updated again. 31862306a36Sopenharmony_ci * In practice we introduce some randomness, so the actual period used 31962306a36Sopenharmony_ci * is chosen randomly from the range: 32062306a36Sopenharmony_ci * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) 32162306a36Sopenharmony_ci */ 32262306a36Sopenharmony_ci if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { 32362306a36Sopenharmony_ci unsigned char rand; 32462306a36Sopenharmony_ci 32562306a36Sopenharmony_ci ca->nv_min_rtt = ca->nv_min_rtt_new; 32662306a36Sopenharmony_ci ca->nv_min_rtt_new = NV_INIT_RTT; 32762306a36Sopenharmony_ci get_random_bytes(&rand, 1); 32862306a36Sopenharmony_ci ca->nv_min_rtt_reset_jiffies = 32962306a36Sopenharmony_ci now + ((nv_reset_period * (384 + rand) * HZ) >> 9); 33062306a36Sopenharmony_ci /* Every so often we decrease ca->nv_min_cwnd in case previous 33162306a36Sopenharmony_ci * value is no longer accurate. 33262306a36Sopenharmony_ci */ 33362306a36Sopenharmony_ci ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); 33462306a36Sopenharmony_ci } 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_ci /* Once per RTT check if we need to do congestion avoidance */ 33762306a36Sopenharmony_ci if (before(ca->nv_rtt_start_seq, tp->snd_una)) { 33862306a36Sopenharmony_ci ca->nv_rtt_start_seq = tp->snd_nxt; 33962306a36Sopenharmony_ci if (ca->nv_rtt_cnt < 0xff) 34062306a36Sopenharmony_ci /* Increase counter for RTTs without CA decision */ 34162306a36Sopenharmony_ci ca->nv_rtt_cnt++; 34262306a36Sopenharmony_ci 34362306a36Sopenharmony_ci /* If this function is only called once within an RTT 34462306a36Sopenharmony_ci * the cwnd is probably too small (in some cases due to 34562306a36Sopenharmony_ci * tso, lro or interrupt coalescence), so we increase 34662306a36Sopenharmony_ci * ca->nv_min_cwnd. 34762306a36Sopenharmony_ci */ 34862306a36Sopenharmony_ci if (ca->nv_eval_call_cnt == 1 && 34962306a36Sopenharmony_ci bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && 35062306a36Sopenharmony_ci ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { 35162306a36Sopenharmony_ci ca->nv_min_cwnd = min(ca->nv_min_cwnd 35262306a36Sopenharmony_ci + NV_MIN_CWND_GROW, 35362306a36Sopenharmony_ci NV_TSO_CWND_BOUND + 1); 35462306a36Sopenharmony_ci ca->nv_rtt_start_seq = tp->snd_nxt + 35562306a36Sopenharmony_ci ca->nv_min_cwnd * tp->mss_cache; 35662306a36Sopenharmony_ci ca->nv_eval_call_cnt = 0; 35762306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 1; 35862306a36Sopenharmony_ci return; 35962306a36Sopenharmony_ci } 36062306a36Sopenharmony_ci 36162306a36Sopenharmony_ci /* Find the ideal cwnd for current rate from slope 36262306a36Sopenharmony_ci * slope = 80000.0 * mss / nv_min_rtt 36362306a36Sopenharmony_ci * cwnd_by_slope = nv_rtt_max_rate / slope 36462306a36Sopenharmony_ci */ 36562306a36Sopenharmony_ci cwnd_by_slope = (u32) 36662306a36Sopenharmony_ci div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, 36762306a36Sopenharmony_ci 80000ULL * tp->mss_cache); 36862306a36Sopenharmony_ci max_win = cwnd_by_slope + nv_pad; 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci /* If cwnd > max_win, decrease cwnd 37162306a36Sopenharmony_ci * if cwnd < max_win, grow cwnd 37262306a36Sopenharmony_ci * else leave the same 37362306a36Sopenharmony_ci */ 37462306a36Sopenharmony_ci if (tcp_snd_cwnd(tp) > max_win) { 37562306a36Sopenharmony_ci /* there is congestion, check that it is ok 37662306a36Sopenharmony_ci * to make a CA decision 37762306a36Sopenharmony_ci * 1. We should have at least nv_dec_eval_min_calls 37862306a36Sopenharmony_ci * data points before making a CA decision 37962306a36Sopenharmony_ci * 2. We only make a congesion decision after 38062306a36Sopenharmony_ci * nv_rtt_min_cnt RTTs 38162306a36Sopenharmony_ci */ 38262306a36Sopenharmony_ci if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { 38362306a36Sopenharmony_ci return; 38462306a36Sopenharmony_ci } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { 38562306a36Sopenharmony_ci if (ca->nv_eval_call_cnt < 38662306a36Sopenharmony_ci nv_ssthresh_eval_min_calls) 38762306a36Sopenharmony_ci return; 38862306a36Sopenharmony_ci /* otherwise we will decrease cwnd */ 38962306a36Sopenharmony_ci } else if (ca->nv_eval_call_cnt < 39062306a36Sopenharmony_ci nv_dec_eval_min_calls) { 39162306a36Sopenharmony_ci if (ca->nv_allow_cwnd_growth && 39262306a36Sopenharmony_ci ca->nv_rtt_cnt > nv_stop_rtt_cnt) 39362306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 0; 39462306a36Sopenharmony_ci return; 39562306a36Sopenharmony_ci } 39662306a36Sopenharmony_ci 39762306a36Sopenharmony_ci /* We have enough data to determine we are congested */ 39862306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 0; 39962306a36Sopenharmony_ci tp->snd_ssthresh = 40062306a36Sopenharmony_ci (nv_ssthresh_factor * max_win) >> 3; 40162306a36Sopenharmony_ci if (tcp_snd_cwnd(tp) - max_win > 2) { 40262306a36Sopenharmony_ci /* gap > 2, we do exponential cwnd decrease */ 40362306a36Sopenharmony_ci int dec; 40462306a36Sopenharmony_ci 40562306a36Sopenharmony_ci dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) * 40662306a36Sopenharmony_ci nv_cong_dec_mult) >> 7); 40762306a36Sopenharmony_ci tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec); 40862306a36Sopenharmony_ci } else if (nv_cong_dec_mult > 0) { 40962306a36Sopenharmony_ci tcp_snd_cwnd_set(tp, max_win); 41062306a36Sopenharmony_ci } 41162306a36Sopenharmony_ci if (ca->cwnd_growth_factor > 0) 41262306a36Sopenharmony_ci ca->cwnd_growth_factor = 0; 41362306a36Sopenharmony_ci ca->nv_no_cong_cnt = 0; 41462306a36Sopenharmony_ci } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) { 41562306a36Sopenharmony_ci /* There is no congestion, grow cwnd if allowed*/ 41662306a36Sopenharmony_ci if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) 41762306a36Sopenharmony_ci return; 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci ca->nv_allow_cwnd_growth = 1; 42062306a36Sopenharmony_ci ca->nv_no_cong_cnt++; 42162306a36Sopenharmony_ci if (ca->cwnd_growth_factor < 0 && 42262306a36Sopenharmony_ci nv_cwnd_growth_rate_neg > 0 && 42362306a36Sopenharmony_ci ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { 42462306a36Sopenharmony_ci ca->cwnd_growth_factor++; 42562306a36Sopenharmony_ci ca->nv_no_cong_cnt = 0; 42662306a36Sopenharmony_ci } else if (ca->cwnd_growth_factor >= 0 && 42762306a36Sopenharmony_ci nv_cwnd_growth_rate_pos > 0 && 42862306a36Sopenharmony_ci ca->nv_no_cong_cnt > 42962306a36Sopenharmony_ci nv_cwnd_growth_rate_pos) { 43062306a36Sopenharmony_ci ca->cwnd_growth_factor++; 43162306a36Sopenharmony_ci ca->nv_no_cong_cnt = 0; 43262306a36Sopenharmony_ci } 43362306a36Sopenharmony_ci } else { 43462306a36Sopenharmony_ci /* cwnd is in-between, so do nothing */ 43562306a36Sopenharmony_ci return; 43662306a36Sopenharmony_ci } 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_ci /* update state */ 43962306a36Sopenharmony_ci ca->nv_eval_call_cnt = 0; 44062306a36Sopenharmony_ci ca->nv_rtt_cnt = 0; 44162306a36Sopenharmony_ci ca->nv_rtt_max_rate = 0; 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ci /* Don't want to make cwnd < nv_min_cwnd 44462306a36Sopenharmony_ci * (it wasn't before, if it is now is because nv 44562306a36Sopenharmony_ci * decreased it). 44662306a36Sopenharmony_ci */ 44762306a36Sopenharmony_ci if (tcp_snd_cwnd(tp) < nv_min_cwnd) 44862306a36Sopenharmony_ci tcp_snd_cwnd_set(tp, nv_min_cwnd); 44962306a36Sopenharmony_ci } 45062306a36Sopenharmony_ci} 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci/* Extract info for Tcp socket info provided via netlink */ 45362306a36Sopenharmony_cistatic size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, 45462306a36Sopenharmony_ci union tcp_cc_info *info) 45562306a36Sopenharmony_ci{ 45662306a36Sopenharmony_ci const struct tcpnv *ca = inet_csk_ca(sk); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 45962306a36Sopenharmony_ci info->vegas.tcpv_enabled = 1; 46062306a36Sopenharmony_ci info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; 46162306a36Sopenharmony_ci info->vegas.tcpv_rtt = ca->nv_last_rtt; 46262306a36Sopenharmony_ci info->vegas.tcpv_minrtt = ca->nv_min_rtt; 46362306a36Sopenharmony_ci 46462306a36Sopenharmony_ci *attr = INET_DIAG_VEGASINFO; 46562306a36Sopenharmony_ci return sizeof(struct tcpvegas_info); 46662306a36Sopenharmony_ci } 46762306a36Sopenharmony_ci return 0; 46862306a36Sopenharmony_ci} 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_cistatic struct tcp_congestion_ops tcpnv __read_mostly = { 47162306a36Sopenharmony_ci .init = tcpnv_init, 47262306a36Sopenharmony_ci .ssthresh = tcpnv_recalc_ssthresh, 47362306a36Sopenharmony_ci .cong_avoid = tcpnv_cong_avoid, 47462306a36Sopenharmony_ci .set_state = tcpnv_state, 47562306a36Sopenharmony_ci .undo_cwnd = tcp_reno_undo_cwnd, 47662306a36Sopenharmony_ci .pkts_acked = tcpnv_acked, 47762306a36Sopenharmony_ci .get_info = tcpnv_get_info, 47862306a36Sopenharmony_ci 47962306a36Sopenharmony_ci .owner = THIS_MODULE, 48062306a36Sopenharmony_ci .name = "nv", 48162306a36Sopenharmony_ci}; 48262306a36Sopenharmony_ci 48362306a36Sopenharmony_cistatic int __init tcpnv_register(void) 48462306a36Sopenharmony_ci{ 48562306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci return tcp_register_congestion_control(&tcpnv); 48862306a36Sopenharmony_ci} 48962306a36Sopenharmony_ci 49062306a36Sopenharmony_cistatic void __exit tcpnv_unregister(void) 49162306a36Sopenharmony_ci{ 49262306a36Sopenharmony_ci tcp_unregister_congestion_control(&tcpnv); 49362306a36Sopenharmony_ci} 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_cimodule_init(tcpnv_register); 49662306a36Sopenharmony_cimodule_exit(tcpnv_unregister); 49762306a36Sopenharmony_ci 49862306a36Sopenharmony_ciMODULE_AUTHOR("Lawrence Brakmo"); 49962306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 50062306a36Sopenharmony_ciMODULE_DESCRIPTION("TCP NV"); 50162306a36Sopenharmony_ciMODULE_VERSION("1.0"); 502