162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * INET An implementation of the TCP/IP protocol suite for the LINUX 462306a36Sopenharmony_ci * operating system. INET is implemented using the BSD Socket 562306a36Sopenharmony_ci * interface as the means of communication with the user level. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Implementation of the Transmission Control Protocol(TCP). 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * Authors: Ross Biro 1062306a36Sopenharmony_ci * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 1162306a36Sopenharmony_ci * Mark Evans, <evansmp@uhura.aston.ac.uk> 1262306a36Sopenharmony_ci * Corey Minyard <wf-rch!minyard@relay.EU.net> 1362306a36Sopenharmony_ci * Florian La Roche, <flla@stud.uni-sb.de> 1462306a36Sopenharmony_ci * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 1562306a36Sopenharmony_ci * Linus Torvalds, <torvalds@cs.helsinki.fi> 1662306a36Sopenharmony_ci * Alan Cox, <gw4pts@gw4pts.ampr.org> 1762306a36Sopenharmony_ci * Matthew Dillon, <dillon@apollo.west.oic.com> 1862306a36Sopenharmony_ci * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 1962306a36Sopenharmony_ci * Jorge Cwik, <jorge@laser.satlink.net> 2062306a36Sopenharmony_ci */ 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_ci#include <net/tcp.h> 2362306a36Sopenharmony_ci#include <net/xfrm.h> 2462306a36Sopenharmony_ci#include <net/busy_poll.h> 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_cistatic bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 2762306a36Sopenharmony_ci{ 2862306a36Sopenharmony_ci if (seq == s_win) 2962306a36Sopenharmony_ci return true; 3062306a36Sopenharmony_ci if (after(end_seq, s_win) && before(seq, e_win)) 3162306a36Sopenharmony_ci return true; 3262306a36Sopenharmony_ci return seq == e_win && seq == end_seq; 3362306a36Sopenharmony_ci} 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_cistatic enum tcp_tw_status 3662306a36Sopenharmony_citcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, 3762306a36Sopenharmony_ci const struct sk_buff *skb, int mib_idx) 3862306a36Sopenharmony_ci{ 3962306a36Sopenharmony_ci struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_ci if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, 4262306a36Sopenharmony_ci &tcptw->tw_last_oow_ack_time)) { 4362306a36Sopenharmony_ci /* Send ACK. Note, we do not put the bucket, 4462306a36Sopenharmony_ci * it will be released by caller. 4562306a36Sopenharmony_ci */ 4662306a36Sopenharmony_ci return TCP_TW_ACK; 4762306a36Sopenharmony_ci } 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci /* We are rate-limiting, so just release the tw sock and drop skb. */ 5062306a36Sopenharmony_ci inet_twsk_put(tw); 5162306a36Sopenharmony_ci return TCP_TW_SUCCESS; 5262306a36Sopenharmony_ci} 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_ci/* 5562306a36Sopenharmony_ci * * Main purpose of TIME-WAIT state is to close connection gracefully, 5662306a36Sopenharmony_ci * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN 5762306a36Sopenharmony_ci * (and, probably, tail of data) and one or more our ACKs are lost. 5862306a36Sopenharmony_ci * * What is TIME-WAIT timeout? It is associated with maximal packet 5962306a36Sopenharmony_ci * lifetime in the internet, which results in wrong conclusion, that 6062306a36Sopenharmony_ci * it is set to catch "old duplicate segments" wandering out of their path. 6162306a36Sopenharmony_ci * It is not quite correct. This timeout is calculated so that it exceeds 6262306a36Sopenharmony_ci * maximal retransmission timeout enough to allow to lose one (or more) 6362306a36Sopenharmony_ci * segments sent by peer and our ACKs. This time may be calculated from RTO. 6462306a36Sopenharmony_ci * * When TIME-WAIT socket receives RST, it means that another end 6562306a36Sopenharmony_ci * finally closed and we are allowed to kill TIME-WAIT too. 6662306a36Sopenharmony_ci * * Second purpose of TIME-WAIT is catching old duplicate segments. 6762306a36Sopenharmony_ci * Well, certainly it is pure paranoia, but if we load TIME-WAIT 6862306a36Sopenharmony_ci * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. 6962306a36Sopenharmony_ci * * If we invented some more clever way to catch duplicates 7062306a36Sopenharmony_ci * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * The algorithm below is based on FORMAL INTERPRETATION of RFCs. 7362306a36Sopenharmony_ci * When you compare it to RFCs, please, read section SEGMENT ARRIVES 7462306a36Sopenharmony_ci * from the very beginning. 7562306a36Sopenharmony_ci * 7662306a36Sopenharmony_ci * NOTE. With recycling (and later with fin-wait-2) TW bucket 7762306a36Sopenharmony_ci * is _not_ stateless. It means, that strictly speaking we must 7862306a36Sopenharmony_ci * spinlock it. I do not want! Well, probability of misbehaviour 7962306a36Sopenharmony_ci * is ridiculously low and, seems, we could use some mb() tricks 8062306a36Sopenharmony_ci * to avoid misread sequence numbers, states etc. --ANK 8162306a36Sopenharmony_ci * 8262306a36Sopenharmony_ci * We don't need to initialize tmp_out.sack_ok as we don't use the results 8362306a36Sopenharmony_ci */ 8462306a36Sopenharmony_cienum tcp_tw_status 8562306a36Sopenharmony_citcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 8662306a36Sopenharmony_ci const struct tcphdr *th) 8762306a36Sopenharmony_ci{ 8862306a36Sopenharmony_ci struct tcp_options_received tmp_opt; 8962306a36Sopenharmony_ci struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 9062306a36Sopenharmony_ci bool paws_reject = false; 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_ci tmp_opt.saw_tstamp = 0; 9362306a36Sopenharmony_ci if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 9462306a36Sopenharmony_ci tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci if (tmp_opt.saw_tstamp) { 9762306a36Sopenharmony_ci if (tmp_opt.rcv_tsecr) 9862306a36Sopenharmony_ci tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; 9962306a36Sopenharmony_ci tmp_opt.ts_recent = tcptw->tw_ts_recent; 10062306a36Sopenharmony_ci tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 10162306a36Sopenharmony_ci paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 10262306a36Sopenharmony_ci } 10362306a36Sopenharmony_ci } 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci if (tw->tw_substate == TCP_FIN_WAIT2) { 10662306a36Sopenharmony_ci /* Just repeat all the checks of tcp_rcv_state_process() */ 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci /* Out of window, send ACK */ 10962306a36Sopenharmony_ci if (paws_reject || 11062306a36Sopenharmony_ci !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 11162306a36Sopenharmony_ci tcptw->tw_rcv_nxt, 11262306a36Sopenharmony_ci tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) 11362306a36Sopenharmony_ci return tcp_timewait_check_oow_rate_limit( 11462306a36Sopenharmony_ci tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci if (th->rst) 11762306a36Sopenharmony_ci goto kill; 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) 12062306a36Sopenharmony_ci return TCP_TW_RST; 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci /* Dup ACK? */ 12362306a36Sopenharmony_ci if (!th->ack || 12462306a36Sopenharmony_ci !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || 12562306a36Sopenharmony_ci TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { 12662306a36Sopenharmony_ci inet_twsk_put(tw); 12762306a36Sopenharmony_ci return TCP_TW_SUCCESS; 12862306a36Sopenharmony_ci } 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci /* New data or FIN. If new data arrive after half-duplex close, 13162306a36Sopenharmony_ci * reset. 13262306a36Sopenharmony_ci */ 13362306a36Sopenharmony_ci if (!th->fin || 13462306a36Sopenharmony_ci TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) 13562306a36Sopenharmony_ci return TCP_TW_RST; 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_ci /* FIN arrived, enter true time-wait state. */ 13862306a36Sopenharmony_ci tw->tw_substate = TCP_TIME_WAIT; 13962306a36Sopenharmony_ci tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 14062306a36Sopenharmony_ci if (tmp_opt.saw_tstamp) { 14162306a36Sopenharmony_ci tcptw->tw_ts_recent_stamp = ktime_get_seconds(); 14262306a36Sopenharmony_ci tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 14362306a36Sopenharmony_ci } 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); 14662306a36Sopenharmony_ci return TCP_TW_ACK; 14762306a36Sopenharmony_ci } 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci /* 15062306a36Sopenharmony_ci * Now real TIME-WAIT state. 15162306a36Sopenharmony_ci * 15262306a36Sopenharmony_ci * RFC 1122: 15362306a36Sopenharmony_ci * "When a connection is [...] on TIME-WAIT state [...] 15462306a36Sopenharmony_ci * [a TCP] MAY accept a new SYN from the remote TCP to 15562306a36Sopenharmony_ci * reopen the connection directly, if it: 15662306a36Sopenharmony_ci * 15762306a36Sopenharmony_ci * (1) assigns its initial sequence number for the new 15862306a36Sopenharmony_ci * connection to be larger than the largest sequence 15962306a36Sopenharmony_ci * number it used on the previous connection incarnation, 16062306a36Sopenharmony_ci * and 16162306a36Sopenharmony_ci * 16262306a36Sopenharmony_ci * (2) returns to TIME-WAIT state if the SYN turns out 16362306a36Sopenharmony_ci * to be an old duplicate". 16462306a36Sopenharmony_ci */ 16562306a36Sopenharmony_ci 16662306a36Sopenharmony_ci if (!paws_reject && 16762306a36Sopenharmony_ci (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && 16862306a36Sopenharmony_ci (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { 16962306a36Sopenharmony_ci /* In window segment, it may be only reset or bare ack. */ 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci if (th->rst) { 17262306a36Sopenharmony_ci /* This is TIME_WAIT assassination, in two flavors. 17362306a36Sopenharmony_ci * Oh well... nobody has a sufficient solution to this 17462306a36Sopenharmony_ci * protocol bug yet. 17562306a36Sopenharmony_ci */ 17662306a36Sopenharmony_ci if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { 17762306a36Sopenharmony_cikill: 17862306a36Sopenharmony_ci inet_twsk_deschedule_put(tw); 17962306a36Sopenharmony_ci return TCP_TW_SUCCESS; 18062306a36Sopenharmony_ci } 18162306a36Sopenharmony_ci } else { 18262306a36Sopenharmony_ci inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); 18362306a36Sopenharmony_ci } 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci if (tmp_opt.saw_tstamp) { 18662306a36Sopenharmony_ci tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 18762306a36Sopenharmony_ci tcptw->tw_ts_recent_stamp = ktime_get_seconds(); 18862306a36Sopenharmony_ci } 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci inet_twsk_put(tw); 19162306a36Sopenharmony_ci return TCP_TW_SUCCESS; 19262306a36Sopenharmony_ci } 19362306a36Sopenharmony_ci 19462306a36Sopenharmony_ci /* Out of window segment. 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci All the segments are ACKed immediately. 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci The only exception is new SYN. We accept it, if it is 19962306a36Sopenharmony_ci not old duplicate and we are not in danger to be killed 20062306a36Sopenharmony_ci by delayed old duplicates. RFC check is that it has 20162306a36Sopenharmony_ci newer sequence number works at rates <40Mbit/sec. 20262306a36Sopenharmony_ci However, if paws works, it is reliable AND even more, 20362306a36Sopenharmony_ci we even may relax silly seq space cutoff. 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci RED-PEN: we violate main RFC requirement, if this SYN will appear 20662306a36Sopenharmony_ci old duplicate (i.e. we receive RST in reply to SYN-ACK), 20762306a36Sopenharmony_ci we must return socket to time-wait state. It is not good, 20862306a36Sopenharmony_ci but not fatal yet. 20962306a36Sopenharmony_ci */ 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci if (th->syn && !th->rst && !th->ack && !paws_reject && 21262306a36Sopenharmony_ci (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || 21362306a36Sopenharmony_ci (tmp_opt.saw_tstamp && 21462306a36Sopenharmony_ci (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { 21562306a36Sopenharmony_ci u32 isn = tcptw->tw_snd_nxt + 65535 + 2; 21662306a36Sopenharmony_ci if (isn == 0) 21762306a36Sopenharmony_ci isn++; 21862306a36Sopenharmony_ci TCP_SKB_CB(skb)->tcp_tw_isn = isn; 21962306a36Sopenharmony_ci return TCP_TW_SYN; 22062306a36Sopenharmony_ci } 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_ci if (paws_reject) 22362306a36Sopenharmony_ci __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci if (!th->rst) { 22662306a36Sopenharmony_ci /* In this case we must reset the TIMEWAIT timer. 22762306a36Sopenharmony_ci * 22862306a36Sopenharmony_ci * If it is ACKless SYN it may be both old duplicate 22962306a36Sopenharmony_ci * and new good SYN with random sequence number <rcv_nxt. 23062306a36Sopenharmony_ci * Do not reschedule in the last case. 23162306a36Sopenharmony_ci */ 23262306a36Sopenharmony_ci if (paws_reject || th->ack) 23362306a36Sopenharmony_ci inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci return tcp_timewait_check_oow_rate_limit( 23662306a36Sopenharmony_ci tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); 23762306a36Sopenharmony_ci } 23862306a36Sopenharmony_ci inet_twsk_put(tw); 23962306a36Sopenharmony_ci return TCP_TW_SUCCESS; 24062306a36Sopenharmony_ci} 24162306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_timewait_state_process); 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_cistatic void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) 24462306a36Sopenharmony_ci{ 24562306a36Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 24662306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 24762306a36Sopenharmony_ci struct tcp_md5sig_key *key; 24862306a36Sopenharmony_ci 24962306a36Sopenharmony_ci /* 25062306a36Sopenharmony_ci * The timewait bucket does not have the key DB from the 25162306a36Sopenharmony_ci * sock structure. We just make a quick copy of the 25262306a36Sopenharmony_ci * md5 key being used (if indeed we are using one) 25362306a36Sopenharmony_ci * so the timewait ack generating code has the key. 25462306a36Sopenharmony_ci */ 25562306a36Sopenharmony_ci tcptw->tw_md5_key = NULL; 25662306a36Sopenharmony_ci if (!static_branch_unlikely(&tcp_md5_needed.key)) 25762306a36Sopenharmony_ci return; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci key = tp->af_specific->md5_lookup(sk, sk); 26062306a36Sopenharmony_ci if (key) { 26162306a36Sopenharmony_ci tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); 26262306a36Sopenharmony_ci if (!tcptw->tw_md5_key) 26362306a36Sopenharmony_ci return; 26462306a36Sopenharmony_ci if (!tcp_alloc_md5sig_pool()) 26562306a36Sopenharmony_ci goto out_free; 26662306a36Sopenharmony_ci if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) 26762306a36Sopenharmony_ci goto out_free; 26862306a36Sopenharmony_ci } 26962306a36Sopenharmony_ci return; 27062306a36Sopenharmony_ciout_free: 27162306a36Sopenharmony_ci WARN_ON_ONCE(1); 27262306a36Sopenharmony_ci kfree(tcptw->tw_md5_key); 27362306a36Sopenharmony_ci tcptw->tw_md5_key = NULL; 27462306a36Sopenharmony_ci#endif 27562306a36Sopenharmony_ci} 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_ci/* 27862306a36Sopenharmony_ci * Move a socket to time-wait or dead fin-wait-2 state. 27962306a36Sopenharmony_ci */ 28062306a36Sopenharmony_civoid tcp_time_wait(struct sock *sk, int state, int timeo) 28162306a36Sopenharmony_ci{ 28262306a36Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 28362306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 28462306a36Sopenharmony_ci struct net *net = sock_net(sk); 28562306a36Sopenharmony_ci struct inet_timewait_sock *tw; 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci if (tw) { 29062306a36Sopenharmony_ci struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 29162306a36Sopenharmony_ci const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); 29462306a36Sopenharmony_ci tw->tw_mark = sk->sk_mark; 29562306a36Sopenharmony_ci tw->tw_priority = sk->sk_priority; 29662306a36Sopenharmony_ci tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 29762306a36Sopenharmony_ci tcptw->tw_rcv_nxt = tp->rcv_nxt; 29862306a36Sopenharmony_ci tcptw->tw_snd_nxt = tp->snd_nxt; 29962306a36Sopenharmony_ci tcptw->tw_rcv_wnd = tcp_receive_window(tp); 30062306a36Sopenharmony_ci tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 30162306a36Sopenharmony_ci tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 30262306a36Sopenharmony_ci tcptw->tw_ts_offset = tp->tsoffset; 30362306a36Sopenharmony_ci tcptw->tw_last_oow_ack_time = 0; 30462306a36Sopenharmony_ci tcptw->tw_tx_delay = tp->tcp_tx_delay; 30562306a36Sopenharmony_ci tw->tw_txhash = sk->sk_txhash; 30662306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 30762306a36Sopenharmony_ci if (tw->tw_family == PF_INET6) { 30862306a36Sopenharmony_ci struct ipv6_pinfo *np = inet6_sk(sk); 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci tw->tw_v6_daddr = sk->sk_v6_daddr; 31162306a36Sopenharmony_ci tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 31262306a36Sopenharmony_ci tw->tw_tclass = np->tclass; 31362306a36Sopenharmony_ci tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); 31462306a36Sopenharmony_ci tw->tw_ipv6only = sk->sk_ipv6only; 31562306a36Sopenharmony_ci } 31662306a36Sopenharmony_ci#endif 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci tcp_time_wait_init(sk, tcptw); 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci /* Get the TIME_WAIT timeout firing. */ 32162306a36Sopenharmony_ci if (timeo < rto) 32262306a36Sopenharmony_ci timeo = rto; 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci if (state == TCP_TIME_WAIT) 32562306a36Sopenharmony_ci timeo = TCP_TIMEWAIT_LEN; 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci /* tw_timer is pinned, so we need to make sure BH are disabled 32862306a36Sopenharmony_ci * in following section, otherwise timer handler could run before 32962306a36Sopenharmony_ci * we complete the initialization. 33062306a36Sopenharmony_ci */ 33162306a36Sopenharmony_ci local_bh_disable(); 33262306a36Sopenharmony_ci inet_twsk_schedule(tw, timeo); 33362306a36Sopenharmony_ci /* Linkage updates. 33462306a36Sopenharmony_ci * Note that access to tw after this point is illegal. 33562306a36Sopenharmony_ci */ 33662306a36Sopenharmony_ci inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); 33762306a36Sopenharmony_ci local_bh_enable(); 33862306a36Sopenharmony_ci } else { 33962306a36Sopenharmony_ci /* Sorry, if we're out of memory, just CLOSE this 34062306a36Sopenharmony_ci * socket up. We've got bigger problems than 34162306a36Sopenharmony_ci * non-graceful socket closings. 34262306a36Sopenharmony_ci */ 34362306a36Sopenharmony_ci NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); 34462306a36Sopenharmony_ci } 34562306a36Sopenharmony_ci 34662306a36Sopenharmony_ci tcp_update_metrics(sk); 34762306a36Sopenharmony_ci tcp_done(sk); 34862306a36Sopenharmony_ci} 34962306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_time_wait); 35062306a36Sopenharmony_ci 35162306a36Sopenharmony_civoid tcp_twsk_destructor(struct sock *sk) 35262306a36Sopenharmony_ci{ 35362306a36Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 35462306a36Sopenharmony_ci if (static_branch_unlikely(&tcp_md5_needed.key)) { 35562306a36Sopenharmony_ci struct tcp_timewait_sock *twsk = tcp_twsk(sk); 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci if (twsk->tw_md5_key) { 35862306a36Sopenharmony_ci kfree_rcu(twsk->tw_md5_key, rcu); 35962306a36Sopenharmony_ci static_branch_slow_dec_deferred(&tcp_md5_needed); 36062306a36Sopenharmony_ci } 36162306a36Sopenharmony_ci } 36262306a36Sopenharmony_ci#endif 36362306a36Sopenharmony_ci} 36462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_twsk_destructor); 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_civoid tcp_twsk_purge(struct list_head *net_exit_list, int family) 36762306a36Sopenharmony_ci{ 36862306a36Sopenharmony_ci bool purged_once = false; 36962306a36Sopenharmony_ci struct net *net; 37062306a36Sopenharmony_ci 37162306a36Sopenharmony_ci list_for_each_entry(net, net_exit_list, exit_list) { 37262306a36Sopenharmony_ci if (net->ipv4.tcp_death_row.hashinfo->pernet) { 37362306a36Sopenharmony_ci /* Even if tw_refcount == 1, we must clean up kernel reqsk */ 37462306a36Sopenharmony_ci inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); 37562306a36Sopenharmony_ci } else if (!purged_once) { 37662306a36Sopenharmony_ci inet_twsk_purge(&tcp_hashinfo, family); 37762306a36Sopenharmony_ci purged_once = true; 37862306a36Sopenharmony_ci } 37962306a36Sopenharmony_ci } 38062306a36Sopenharmony_ci} 38162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_twsk_purge); 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci/* Warning : This function is called without sk_listener being locked. 38462306a36Sopenharmony_ci * Be sure to read socket fields once, as their value could change under us. 38562306a36Sopenharmony_ci */ 38662306a36Sopenharmony_civoid tcp_openreq_init_rwin(struct request_sock *req, 38762306a36Sopenharmony_ci const struct sock *sk_listener, 38862306a36Sopenharmony_ci const struct dst_entry *dst) 38962306a36Sopenharmony_ci{ 39062306a36Sopenharmony_ci struct inet_request_sock *ireq = inet_rsk(req); 39162306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk_listener); 39262306a36Sopenharmony_ci int full_space = tcp_full_space(sk_listener); 39362306a36Sopenharmony_ci u32 window_clamp; 39462306a36Sopenharmony_ci __u8 rcv_wscale; 39562306a36Sopenharmony_ci u32 rcv_wnd; 39662306a36Sopenharmony_ci int mss; 39762306a36Sopenharmony_ci 39862306a36Sopenharmony_ci mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); 39962306a36Sopenharmony_ci window_clamp = READ_ONCE(tp->window_clamp); 40062306a36Sopenharmony_ci /* Set this up on the first call only */ 40162306a36Sopenharmony_ci req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); 40262306a36Sopenharmony_ci 40362306a36Sopenharmony_ci /* limit the window selection if the user enforce a smaller rx buffer */ 40462306a36Sopenharmony_ci if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && 40562306a36Sopenharmony_ci (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) 40662306a36Sopenharmony_ci req->rsk_window_clamp = full_space; 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); 40962306a36Sopenharmony_ci if (rcv_wnd == 0) 41062306a36Sopenharmony_ci rcv_wnd = dst_metric(dst, RTAX_INITRWND); 41162306a36Sopenharmony_ci else if (full_space < rcv_wnd * mss) 41262306a36Sopenharmony_ci full_space = rcv_wnd * mss; 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci /* tcp_full_space because it is guaranteed to be the first packet */ 41562306a36Sopenharmony_ci tcp_select_initial_window(sk_listener, full_space, 41662306a36Sopenharmony_ci mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 41762306a36Sopenharmony_ci &req->rsk_rcv_wnd, 41862306a36Sopenharmony_ci &req->rsk_window_clamp, 41962306a36Sopenharmony_ci ireq->wscale_ok, 42062306a36Sopenharmony_ci &rcv_wscale, 42162306a36Sopenharmony_ci rcv_wnd); 42262306a36Sopenharmony_ci ireq->rcv_wscale = rcv_wscale; 42362306a36Sopenharmony_ci} 42462306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_openreq_init_rwin); 42562306a36Sopenharmony_ci 42662306a36Sopenharmony_cistatic void tcp_ecn_openreq_child(struct tcp_sock *tp, 42762306a36Sopenharmony_ci const struct request_sock *req) 42862306a36Sopenharmony_ci{ 42962306a36Sopenharmony_ci tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 43062306a36Sopenharmony_ci} 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_civoid tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) 43362306a36Sopenharmony_ci{ 43462306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 43562306a36Sopenharmony_ci u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); 43662306a36Sopenharmony_ci bool ca_got_dst = false; 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_ci if (ca_key != TCP_CA_UNSPEC) { 43962306a36Sopenharmony_ci const struct tcp_congestion_ops *ca; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci rcu_read_lock(); 44262306a36Sopenharmony_ci ca = tcp_ca_find_key(ca_key); 44362306a36Sopenharmony_ci if (likely(ca && bpf_try_module_get(ca, ca->owner))) { 44462306a36Sopenharmony_ci icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); 44562306a36Sopenharmony_ci icsk->icsk_ca_ops = ca; 44662306a36Sopenharmony_ci ca_got_dst = true; 44762306a36Sopenharmony_ci } 44862306a36Sopenharmony_ci rcu_read_unlock(); 44962306a36Sopenharmony_ci } 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci /* If no valid choice made yet, assign current system default ca. */ 45262306a36Sopenharmony_ci if (!ca_got_dst && 45362306a36Sopenharmony_ci (!icsk->icsk_ca_setsockopt || 45462306a36Sopenharmony_ci !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) 45562306a36Sopenharmony_ci tcp_assign_congestion_control(sk); 45662306a36Sopenharmony_ci 45762306a36Sopenharmony_ci tcp_set_ca_state(sk, TCP_CA_Open); 45862306a36Sopenharmony_ci} 45962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_ca_openreq_child); 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_cistatic void smc_check_reset_syn_req(const struct tcp_sock *oldtp, 46262306a36Sopenharmony_ci struct request_sock *req, 46362306a36Sopenharmony_ci struct tcp_sock *newtp) 46462306a36Sopenharmony_ci{ 46562306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_SMC) 46662306a36Sopenharmony_ci struct inet_request_sock *ireq; 46762306a36Sopenharmony_ci 46862306a36Sopenharmony_ci if (static_branch_unlikely(&tcp_have_smc)) { 46962306a36Sopenharmony_ci ireq = inet_rsk(req); 47062306a36Sopenharmony_ci if (oldtp->syn_smc && !ireq->smc_ok) 47162306a36Sopenharmony_ci newtp->syn_smc = 0; 47262306a36Sopenharmony_ci } 47362306a36Sopenharmony_ci#endif 47462306a36Sopenharmony_ci} 47562306a36Sopenharmony_ci 47662306a36Sopenharmony_ci/* This is not only more efficient than what we used to do, it eliminates 47762306a36Sopenharmony_ci * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 47862306a36Sopenharmony_ci * 47962306a36Sopenharmony_ci * Actually, we could lots of memory writes here. tp of listening 48062306a36Sopenharmony_ci * socket contains all necessary default parameters. 48162306a36Sopenharmony_ci */ 48262306a36Sopenharmony_cistruct sock *tcp_create_openreq_child(const struct sock *sk, 48362306a36Sopenharmony_ci struct request_sock *req, 48462306a36Sopenharmony_ci struct sk_buff *skb) 48562306a36Sopenharmony_ci{ 48662306a36Sopenharmony_ci struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); 48762306a36Sopenharmony_ci const struct inet_request_sock *ireq = inet_rsk(req); 48862306a36Sopenharmony_ci struct tcp_request_sock *treq = tcp_rsk(req); 48962306a36Sopenharmony_ci struct inet_connection_sock *newicsk; 49062306a36Sopenharmony_ci const struct tcp_sock *oldtp; 49162306a36Sopenharmony_ci struct tcp_sock *newtp; 49262306a36Sopenharmony_ci u32 seq; 49362306a36Sopenharmony_ci 49462306a36Sopenharmony_ci if (!newsk) 49562306a36Sopenharmony_ci return NULL; 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_ci newicsk = inet_csk(newsk); 49862306a36Sopenharmony_ci newtp = tcp_sk(newsk); 49962306a36Sopenharmony_ci oldtp = tcp_sk(sk); 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_ci smc_check_reset_syn_req(oldtp, req, newtp); 50262306a36Sopenharmony_ci 50362306a36Sopenharmony_ci /* Now setup tcp_sock */ 50462306a36Sopenharmony_ci newtp->pred_flags = 0; 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci seq = treq->rcv_isn + 1; 50762306a36Sopenharmony_ci newtp->rcv_wup = seq; 50862306a36Sopenharmony_ci WRITE_ONCE(newtp->copied_seq, seq); 50962306a36Sopenharmony_ci WRITE_ONCE(newtp->rcv_nxt, seq); 51062306a36Sopenharmony_ci newtp->segs_in = 1; 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_ci seq = treq->snt_isn + 1; 51362306a36Sopenharmony_ci newtp->snd_sml = newtp->snd_una = seq; 51462306a36Sopenharmony_ci WRITE_ONCE(newtp->snd_nxt, seq); 51562306a36Sopenharmony_ci newtp->snd_up = seq; 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci INIT_LIST_HEAD(&newtp->tsq_node); 51862306a36Sopenharmony_ci INIT_LIST_HEAD(&newtp->tsorted_sent_queue); 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci tcp_init_wl(newtp, treq->rcv_isn); 52162306a36Sopenharmony_ci 52262306a36Sopenharmony_ci minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); 52362306a36Sopenharmony_ci newicsk->icsk_ack.lrcvtime = tcp_jiffies32; 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci newtp->lsndtime = tcp_jiffies32; 52662306a36Sopenharmony_ci newsk->sk_txhash = READ_ONCE(treq->txhash); 52762306a36Sopenharmony_ci newtp->total_retrans = req->num_retrans; 52862306a36Sopenharmony_ci 52962306a36Sopenharmony_ci tcp_init_xmit_timers(newsk); 53062306a36Sopenharmony_ci WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); 53162306a36Sopenharmony_ci 53262306a36Sopenharmony_ci if (sock_flag(newsk, SOCK_KEEPOPEN)) 53362306a36Sopenharmony_ci inet_csk_reset_keepalive_timer(newsk, 53462306a36Sopenharmony_ci keepalive_time_when(newtp)); 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 53762306a36Sopenharmony_ci newtp->rx_opt.sack_ok = ireq->sack_ok; 53862306a36Sopenharmony_ci newtp->window_clamp = req->rsk_window_clamp; 53962306a36Sopenharmony_ci newtp->rcv_ssthresh = req->rsk_rcv_wnd; 54062306a36Sopenharmony_ci newtp->rcv_wnd = req->rsk_rcv_wnd; 54162306a36Sopenharmony_ci newtp->rx_opt.wscale_ok = ireq->wscale_ok; 54262306a36Sopenharmony_ci if (newtp->rx_opt.wscale_ok) { 54362306a36Sopenharmony_ci newtp->rx_opt.snd_wscale = ireq->snd_wscale; 54462306a36Sopenharmony_ci newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; 54562306a36Sopenharmony_ci } else { 54662306a36Sopenharmony_ci newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 54762306a36Sopenharmony_ci newtp->window_clamp = min(newtp->window_clamp, 65535U); 54862306a36Sopenharmony_ci } 54962306a36Sopenharmony_ci newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; 55062306a36Sopenharmony_ci newtp->max_window = newtp->snd_wnd; 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_ci if (newtp->rx_opt.tstamp_ok) { 55362306a36Sopenharmony_ci newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); 55462306a36Sopenharmony_ci newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); 55562306a36Sopenharmony_ci newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 55662306a36Sopenharmony_ci } else { 55762306a36Sopenharmony_ci newtp->rx_opt.ts_recent_stamp = 0; 55862306a36Sopenharmony_ci newtp->tcp_header_len = sizeof(struct tcphdr); 55962306a36Sopenharmony_ci } 56062306a36Sopenharmony_ci if (req->num_timeout) { 56162306a36Sopenharmony_ci newtp->undo_marker = treq->snt_isn; 56262306a36Sopenharmony_ci newtp->retrans_stamp = div_u64(treq->snt_synack, 56362306a36Sopenharmony_ci USEC_PER_SEC / TCP_TS_HZ); 56462306a36Sopenharmony_ci } 56562306a36Sopenharmony_ci newtp->tsoffset = treq->ts_off; 56662306a36Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 56762306a36Sopenharmony_ci newtp->md5sig_info = NULL; /*XXX*/ 56862306a36Sopenharmony_ci#endif 56962306a36Sopenharmony_ci if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) 57062306a36Sopenharmony_ci newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 57162306a36Sopenharmony_ci newtp->rx_opt.mss_clamp = req->mss; 57262306a36Sopenharmony_ci tcp_ecn_openreq_child(newtp, req); 57362306a36Sopenharmony_ci newtp->fastopen_req = NULL; 57462306a36Sopenharmony_ci RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); 57562306a36Sopenharmony_ci 57662306a36Sopenharmony_ci newtp->bpf_chg_cc_inprogress = 0; 57762306a36Sopenharmony_ci tcp_bpf_clone(sk, newsk); 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci return newsk; 58262306a36Sopenharmony_ci} 58362306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_create_openreq_child); 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci/* 58662306a36Sopenharmony_ci * Process an incoming packet for SYN_RECV sockets represented as a 58762306a36Sopenharmony_ci * request_sock. Normally sk is the listener socket but for TFO it 58862306a36Sopenharmony_ci * points to the child socket. 58962306a36Sopenharmony_ci * 59062306a36Sopenharmony_ci * XXX (TFO) - The current impl contains a special check for ack 59162306a36Sopenharmony_ci * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? 59262306a36Sopenharmony_ci * 59362306a36Sopenharmony_ci * We don't need to initialize tmp_opt.sack_ok as we don't use the results 59462306a36Sopenharmony_ci * 59562306a36Sopenharmony_ci * Note: If @fastopen is true, this can be called from process context. 59662306a36Sopenharmony_ci * Otherwise, this is from BH context. 59762306a36Sopenharmony_ci */ 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_cistruct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 60062306a36Sopenharmony_ci struct request_sock *req, 60162306a36Sopenharmony_ci bool fastopen, bool *req_stolen) 60262306a36Sopenharmony_ci{ 60362306a36Sopenharmony_ci struct tcp_options_received tmp_opt; 60462306a36Sopenharmony_ci struct sock *child; 60562306a36Sopenharmony_ci const struct tcphdr *th = tcp_hdr(skb); 60662306a36Sopenharmony_ci __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 60762306a36Sopenharmony_ci bool paws_reject = false; 60862306a36Sopenharmony_ci bool own_req; 60962306a36Sopenharmony_ci 61062306a36Sopenharmony_ci tmp_opt.saw_tstamp = 0; 61162306a36Sopenharmony_ci if (th->doff > (sizeof(struct tcphdr)>>2)) { 61262306a36Sopenharmony_ci tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); 61362306a36Sopenharmony_ci 61462306a36Sopenharmony_ci if (tmp_opt.saw_tstamp) { 61562306a36Sopenharmony_ci tmp_opt.ts_recent = READ_ONCE(req->ts_recent); 61662306a36Sopenharmony_ci if (tmp_opt.rcv_tsecr) 61762306a36Sopenharmony_ci tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; 61862306a36Sopenharmony_ci /* We do not store true stamp, but it is not required, 61962306a36Sopenharmony_ci * it can be estimated (approximately) 62062306a36Sopenharmony_ci * from another data. 62162306a36Sopenharmony_ci */ 62262306a36Sopenharmony_ci tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; 62362306a36Sopenharmony_ci paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 62462306a36Sopenharmony_ci } 62562306a36Sopenharmony_ci } 62662306a36Sopenharmony_ci 62762306a36Sopenharmony_ci /* Check for pure retransmitted SYN. */ 62862306a36Sopenharmony_ci if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && 62962306a36Sopenharmony_ci flg == TCP_FLAG_SYN && 63062306a36Sopenharmony_ci !paws_reject) { 63162306a36Sopenharmony_ci /* 63262306a36Sopenharmony_ci * RFC793 draws (Incorrectly! It was fixed in RFC1122) 63362306a36Sopenharmony_ci * this case on figure 6 and figure 8, but formal 63462306a36Sopenharmony_ci * protocol description says NOTHING. 63562306a36Sopenharmony_ci * To be more exact, it says that we should send ACK, 63662306a36Sopenharmony_ci * because this segment (at least, if it has no data) 63762306a36Sopenharmony_ci * is out of window. 63862306a36Sopenharmony_ci * 63962306a36Sopenharmony_ci * CONCLUSION: RFC793 (even with RFC1122) DOES NOT 64062306a36Sopenharmony_ci * describe SYN-RECV state. All the description 64162306a36Sopenharmony_ci * is wrong, we cannot believe to it and should 64262306a36Sopenharmony_ci * rely only on common sense and implementation 64362306a36Sopenharmony_ci * experience. 64462306a36Sopenharmony_ci * 64562306a36Sopenharmony_ci * Enforce "SYN-ACK" according to figure 8, figure 6 64662306a36Sopenharmony_ci * of RFC793, fixed by RFC1122. 64762306a36Sopenharmony_ci * 64862306a36Sopenharmony_ci * Note that even if there is new data in the SYN packet 64962306a36Sopenharmony_ci * they will be thrown away too. 65062306a36Sopenharmony_ci * 65162306a36Sopenharmony_ci * Reset timer after retransmitting SYNACK, similar to 65262306a36Sopenharmony_ci * the idea of fast retransmit in recovery. 65362306a36Sopenharmony_ci */ 65462306a36Sopenharmony_ci if (!tcp_oow_rate_limited(sock_net(sk), skb, 65562306a36Sopenharmony_ci LINUX_MIB_TCPACKSKIPPEDSYNRECV, 65662306a36Sopenharmony_ci &tcp_rsk(req)->last_oow_ack_time) && 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci !inet_rtx_syn_ack(sk, req)) { 65962306a36Sopenharmony_ci unsigned long expires = jiffies; 66062306a36Sopenharmony_ci 66162306a36Sopenharmony_ci expires += reqsk_timeout(req, TCP_RTO_MAX); 66262306a36Sopenharmony_ci if (!fastopen) 66362306a36Sopenharmony_ci mod_timer_pending(&req->rsk_timer, expires); 66462306a36Sopenharmony_ci else 66562306a36Sopenharmony_ci req->rsk_timer.expires = expires; 66662306a36Sopenharmony_ci } 66762306a36Sopenharmony_ci return NULL; 66862306a36Sopenharmony_ci } 66962306a36Sopenharmony_ci 67062306a36Sopenharmony_ci /* Further reproduces section "SEGMENT ARRIVES" 67162306a36Sopenharmony_ci for state SYN-RECEIVED of RFC793. 67262306a36Sopenharmony_ci It is broken, however, it does not work only 67362306a36Sopenharmony_ci when SYNs are crossed. 67462306a36Sopenharmony_ci 67562306a36Sopenharmony_ci You would think that SYN crossing is impossible here, since 67662306a36Sopenharmony_ci we should have a SYN_SENT socket (from connect()) on our end, 67762306a36Sopenharmony_ci but this is not true if the crossed SYNs were sent to both 67862306a36Sopenharmony_ci ends by a malicious third party. We must defend against this, 67962306a36Sopenharmony_ci and to do that we first verify the ACK (as per RFC793, page 68062306a36Sopenharmony_ci 36) and reset if it is invalid. Is this a true full defense? 68162306a36Sopenharmony_ci To convince ourselves, let us consider a way in which the ACK 68262306a36Sopenharmony_ci test can still pass in this 'malicious crossed SYNs' case. 68362306a36Sopenharmony_ci Malicious sender sends identical SYNs (and thus identical sequence 68462306a36Sopenharmony_ci numbers) to both A and B: 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci A: gets SYN, seq=7 68762306a36Sopenharmony_ci B: gets SYN, seq=7 68862306a36Sopenharmony_ci 68962306a36Sopenharmony_ci By our good fortune, both A and B select the same initial 69062306a36Sopenharmony_ci send sequence number of seven :-) 69162306a36Sopenharmony_ci 69262306a36Sopenharmony_ci A: sends SYN|ACK, seq=7, ack_seq=8 69362306a36Sopenharmony_ci B: sends SYN|ACK, seq=7, ack_seq=8 69462306a36Sopenharmony_ci 69562306a36Sopenharmony_ci So we are now A eating this SYN|ACK, ACK test passes. So 69662306a36Sopenharmony_ci does sequence test, SYN is truncated, and thus we consider 69762306a36Sopenharmony_ci it a bare ACK. 69862306a36Sopenharmony_ci 69962306a36Sopenharmony_ci If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this 70062306a36Sopenharmony_ci bare ACK. Otherwise, we create an established connection. Both 70162306a36Sopenharmony_ci ends (listening sockets) accept the new incoming connection and try 70262306a36Sopenharmony_ci to talk to each other. 8-) 70362306a36Sopenharmony_ci 70462306a36Sopenharmony_ci Note: This case is both harmless, and rare. Possibility is about the 70562306a36Sopenharmony_ci same as us discovering intelligent life on another plant tomorrow. 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_ci But generally, we should (RFC lies!) to accept ACK 70862306a36Sopenharmony_ci from SYNACK both here and in tcp_rcv_state_process(). 70962306a36Sopenharmony_ci tcp_rcv_state_process() does not, hence, we do not too. 71062306a36Sopenharmony_ci 71162306a36Sopenharmony_ci Note that the case is absolutely generic: 71262306a36Sopenharmony_ci we cannot optimize anything here without 71362306a36Sopenharmony_ci violating protocol. All the checks must be made 71462306a36Sopenharmony_ci before attempt to create socket. 71562306a36Sopenharmony_ci */ 71662306a36Sopenharmony_ci 71762306a36Sopenharmony_ci /* RFC793 page 36: "If the connection is in any non-synchronized state ... 71862306a36Sopenharmony_ci * and the incoming segment acknowledges something not yet 71962306a36Sopenharmony_ci * sent (the segment carries an unacceptable ACK) ... 72062306a36Sopenharmony_ci * a reset is sent." 72162306a36Sopenharmony_ci * 72262306a36Sopenharmony_ci * Invalid ACK: reset will be sent by listening socket. 72362306a36Sopenharmony_ci * Note that the ACK validity check for a Fast Open socket is done 72462306a36Sopenharmony_ci * elsewhere and is checked directly against the child socket rather 72562306a36Sopenharmony_ci * than req because user data may have been sent out. 72662306a36Sopenharmony_ci */ 72762306a36Sopenharmony_ci if ((flg & TCP_FLAG_ACK) && !fastopen && 72862306a36Sopenharmony_ci (TCP_SKB_CB(skb)->ack_seq != 72962306a36Sopenharmony_ci tcp_rsk(req)->snt_isn + 1)) 73062306a36Sopenharmony_ci return sk; 73162306a36Sopenharmony_ci 73262306a36Sopenharmony_ci /* Also, it would be not so bad idea to check rcv_tsecr, which 73362306a36Sopenharmony_ci * is essentially ACK extension and too early or too late values 73462306a36Sopenharmony_ci * should cause reset in unsynchronized states. 73562306a36Sopenharmony_ci */ 73662306a36Sopenharmony_ci 73762306a36Sopenharmony_ci /* RFC793: "first check sequence number". */ 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 74062306a36Sopenharmony_ci tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { 74162306a36Sopenharmony_ci /* Out of window: send ACK and drop. */ 74262306a36Sopenharmony_ci if (!(flg & TCP_FLAG_RST) && 74362306a36Sopenharmony_ci !tcp_oow_rate_limited(sock_net(sk), skb, 74462306a36Sopenharmony_ci LINUX_MIB_TCPACKSKIPPEDSYNRECV, 74562306a36Sopenharmony_ci &tcp_rsk(req)->last_oow_ack_time)) 74662306a36Sopenharmony_ci req->rsk_ops->send_ack(sk, skb, req); 74762306a36Sopenharmony_ci if (paws_reject) 74862306a36Sopenharmony_ci NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 74962306a36Sopenharmony_ci return NULL; 75062306a36Sopenharmony_ci } 75162306a36Sopenharmony_ci 75262306a36Sopenharmony_ci /* In sequence, PAWS is OK. */ 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci /* TODO: We probably should defer ts_recent change once 75562306a36Sopenharmony_ci * we take ownership of @req. 75662306a36Sopenharmony_ci */ 75762306a36Sopenharmony_ci if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) 75862306a36Sopenharmony_ci WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 76162306a36Sopenharmony_ci /* Truncate SYN, it is out of window starting 76262306a36Sopenharmony_ci at tcp_rsk(req)->rcv_isn + 1. */ 76362306a36Sopenharmony_ci flg &= ~TCP_FLAG_SYN; 76462306a36Sopenharmony_ci } 76562306a36Sopenharmony_ci 76662306a36Sopenharmony_ci /* RFC793: "second check the RST bit" and 76762306a36Sopenharmony_ci * "fourth, check the SYN bit" 76862306a36Sopenharmony_ci */ 76962306a36Sopenharmony_ci if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { 77062306a36Sopenharmony_ci TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 77162306a36Sopenharmony_ci goto embryonic_reset; 77262306a36Sopenharmony_ci } 77362306a36Sopenharmony_ci 77462306a36Sopenharmony_ci /* ACK sequence verified above, just make sure ACK is 77562306a36Sopenharmony_ci * set. If ACK not set, just silently drop the packet. 77662306a36Sopenharmony_ci * 77762306a36Sopenharmony_ci * XXX (TFO) - if we ever allow "data after SYN", the 77862306a36Sopenharmony_ci * following check needs to be removed. 77962306a36Sopenharmony_ci */ 78062306a36Sopenharmony_ci if (!(flg & TCP_FLAG_ACK)) 78162306a36Sopenharmony_ci return NULL; 78262306a36Sopenharmony_ci 78362306a36Sopenharmony_ci /* For Fast Open no more processing is needed (sk is the 78462306a36Sopenharmony_ci * child socket). 78562306a36Sopenharmony_ci */ 78662306a36Sopenharmony_ci if (fastopen) 78762306a36Sopenharmony_ci return sk; 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 79062306a36Sopenharmony_ci if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && 79162306a36Sopenharmony_ci TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 79262306a36Sopenharmony_ci inet_rsk(req)->acked = 1; 79362306a36Sopenharmony_ci __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 79462306a36Sopenharmony_ci return NULL; 79562306a36Sopenharmony_ci } 79662306a36Sopenharmony_ci 79762306a36Sopenharmony_ci /* OK, ACK is valid, create big socket and 79862306a36Sopenharmony_ci * feed this segment to it. It will repeat all 79962306a36Sopenharmony_ci * the tests. THIS SEGMENT MUST MOVE SOCKET TO 80062306a36Sopenharmony_ci * ESTABLISHED STATE. If it will be dropped after 80162306a36Sopenharmony_ci * socket is created, wait for troubles. 80262306a36Sopenharmony_ci */ 80362306a36Sopenharmony_ci child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, 80462306a36Sopenharmony_ci req, &own_req); 80562306a36Sopenharmony_ci if (!child) 80662306a36Sopenharmony_ci goto listen_overflow; 80762306a36Sopenharmony_ci 80862306a36Sopenharmony_ci if (own_req && rsk_drop_req(req)) { 80962306a36Sopenharmony_ci reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); 81062306a36Sopenharmony_ci inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); 81162306a36Sopenharmony_ci return child; 81262306a36Sopenharmony_ci } 81362306a36Sopenharmony_ci 81462306a36Sopenharmony_ci sock_rps_save_rxhash(child, skb); 81562306a36Sopenharmony_ci tcp_synack_rtt_meas(child, req); 81662306a36Sopenharmony_ci *req_stolen = !own_req; 81762306a36Sopenharmony_ci return inet_csk_complete_hashdance(sk, child, req, own_req); 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_cilisten_overflow: 82062306a36Sopenharmony_ci if (sk != req->rsk_listener) 82162306a36Sopenharmony_ci __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); 82262306a36Sopenharmony_ci 82362306a36Sopenharmony_ci if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { 82462306a36Sopenharmony_ci inet_rsk(req)->acked = 1; 82562306a36Sopenharmony_ci return NULL; 82662306a36Sopenharmony_ci } 82762306a36Sopenharmony_ci 82862306a36Sopenharmony_ciembryonic_reset: 82962306a36Sopenharmony_ci if (!(flg & TCP_FLAG_RST)) { 83062306a36Sopenharmony_ci /* Received a bad SYN pkt - for TFO We try not to reset 83162306a36Sopenharmony_ci * the local connection unless it's really necessary to 83262306a36Sopenharmony_ci * avoid becoming vulnerable to outside attack aiming at 83362306a36Sopenharmony_ci * resetting legit local connections. 83462306a36Sopenharmony_ci */ 83562306a36Sopenharmony_ci req->rsk_ops->send_reset(sk, skb); 83662306a36Sopenharmony_ci } else if (fastopen) { /* received a valid RST pkt */ 83762306a36Sopenharmony_ci reqsk_fastopen_remove(sk, req, true); 83862306a36Sopenharmony_ci tcp_reset(sk, skb); 83962306a36Sopenharmony_ci } 84062306a36Sopenharmony_ci if (!fastopen) { 84162306a36Sopenharmony_ci bool unlinked = inet_csk_reqsk_queue_drop(sk, req); 84262306a36Sopenharmony_ci 84362306a36Sopenharmony_ci if (unlinked) 84462306a36Sopenharmony_ci __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 84562306a36Sopenharmony_ci *req_stolen = !unlinked; 84662306a36Sopenharmony_ci } 84762306a36Sopenharmony_ci return NULL; 84862306a36Sopenharmony_ci} 84962306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_check_req); 85062306a36Sopenharmony_ci 85162306a36Sopenharmony_ci/* 85262306a36Sopenharmony_ci * Queue segment on the new socket if the new socket is active, 85362306a36Sopenharmony_ci * otherwise we just shortcircuit this and continue with 85462306a36Sopenharmony_ci * the new socket. 85562306a36Sopenharmony_ci * 85662306a36Sopenharmony_ci * For the vast majority of cases child->sk_state will be TCP_SYN_RECV 85762306a36Sopenharmony_ci * when entering. But other states are possible due to a race condition 85862306a36Sopenharmony_ci * where after __inet_lookup_established() fails but before the listener 85962306a36Sopenharmony_ci * locked is obtained, other packets cause the same connection to 86062306a36Sopenharmony_ci * be created. 86162306a36Sopenharmony_ci */ 86262306a36Sopenharmony_ci 86362306a36Sopenharmony_ciint tcp_child_process(struct sock *parent, struct sock *child, 86462306a36Sopenharmony_ci struct sk_buff *skb) 86562306a36Sopenharmony_ci __releases(&((child)->sk_lock.slock)) 86662306a36Sopenharmony_ci{ 86762306a36Sopenharmony_ci int ret = 0; 86862306a36Sopenharmony_ci int state = child->sk_state; 86962306a36Sopenharmony_ci 87062306a36Sopenharmony_ci /* record sk_napi_id and sk_rx_queue_mapping of child. */ 87162306a36Sopenharmony_ci sk_mark_napi_id_set(child, skb); 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci tcp_segs_in(tcp_sk(child), skb); 87462306a36Sopenharmony_ci if (!sock_owned_by_user(child)) { 87562306a36Sopenharmony_ci ret = tcp_rcv_state_process(child, skb); 87662306a36Sopenharmony_ci /* Wakeup parent, send SIGIO */ 87762306a36Sopenharmony_ci if (state == TCP_SYN_RECV && child->sk_state != state) 87862306a36Sopenharmony_ci parent->sk_data_ready(parent); 87962306a36Sopenharmony_ci } else { 88062306a36Sopenharmony_ci /* Alas, it is possible again, because we do lookup 88162306a36Sopenharmony_ci * in main socket hash table and lock on listening 88262306a36Sopenharmony_ci * socket does not protect us more. 88362306a36Sopenharmony_ci */ 88462306a36Sopenharmony_ci __sk_add_backlog(child, skb); 88562306a36Sopenharmony_ci } 88662306a36Sopenharmony_ci 88762306a36Sopenharmony_ci bh_unlock_sock(child); 88862306a36Sopenharmony_ci sock_put(child); 88962306a36Sopenharmony_ci return ret; 89062306a36Sopenharmony_ci} 89162306a36Sopenharmony_ciEXPORT_SYMBOL(tcp_child_process); 892