162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * INET An implementation of the TCP/IP protocol suite for the LINUX 462306a36Sopenharmony_ci * operating system. INET is implemented using the BSD Socket 562306a36Sopenharmony_ci * interface as the means of communication with the user level. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Generic TIME_WAIT sockets functions 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * From code orinally in TCP 1062306a36Sopenharmony_ci */ 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci#include <linux/kernel.h> 1362306a36Sopenharmony_ci#include <linux/slab.h> 1462306a36Sopenharmony_ci#include <linux/module.h> 1562306a36Sopenharmony_ci#include <net/inet_hashtables.h> 1662306a36Sopenharmony_ci#include <net/inet_timewait_sock.h> 1762306a36Sopenharmony_ci#include <net/ip.h> 1862306a36Sopenharmony_ci 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_ci/** 2162306a36Sopenharmony_ci * inet_twsk_bind_unhash - unhash a timewait socket from bind hash 2262306a36Sopenharmony_ci * @tw: timewait socket 2362306a36Sopenharmony_ci * @hashinfo: hashinfo pointer 2462306a36Sopenharmony_ci * 2562306a36Sopenharmony_ci * unhash a timewait socket from bind hash, if hashed. 2662306a36Sopenharmony_ci * bind hash lock must be held by caller. 2762306a36Sopenharmony_ci * Returns 1 if caller should call inet_twsk_put() after lock release. 2862306a36Sopenharmony_ci */ 2962306a36Sopenharmony_civoid inet_twsk_bind_unhash(struct inet_timewait_sock *tw, 3062306a36Sopenharmony_ci struct inet_hashinfo *hashinfo) 3162306a36Sopenharmony_ci{ 3262306a36Sopenharmony_ci struct inet_bind2_bucket *tb2 = tw->tw_tb2; 3362306a36Sopenharmony_ci struct inet_bind_bucket *tb = tw->tw_tb; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci if (!tb) 3662306a36Sopenharmony_ci return; 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci __hlist_del(&tw->tw_bind_node); 3962306a36Sopenharmony_ci tw->tw_tb = NULL; 4062306a36Sopenharmony_ci inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_ci __hlist_del(&tw->tw_bind2_node); 4362306a36Sopenharmony_ci tw->tw_tb2 = NULL; 4462306a36Sopenharmony_ci inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci __sock_put((struct sock *)tw); 4762306a36Sopenharmony_ci} 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci/* Must be called with locally disabled BHs. */ 5062306a36Sopenharmony_cistatic void inet_twsk_kill(struct inet_timewait_sock *tw) 5162306a36Sopenharmony_ci{ 5262306a36Sopenharmony_ci struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; 5362306a36Sopenharmony_ci spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); 5462306a36Sopenharmony_ci struct inet_bind_hashbucket *bhead, *bhead2; 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci spin_lock(lock); 5762306a36Sopenharmony_ci sk_nulls_del_node_init_rcu((struct sock *)tw); 5862306a36Sopenharmony_ci spin_unlock(lock); 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_ci /* Disassociate with bind bucket. */ 6162306a36Sopenharmony_ci bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, 6262306a36Sopenharmony_ci hashinfo->bhash_size)]; 6362306a36Sopenharmony_ci bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, 6462306a36Sopenharmony_ci twsk_net(tw), tw->tw_num); 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_ci spin_lock(&bhead->lock); 6762306a36Sopenharmony_ci spin_lock(&bhead2->lock); 6862306a36Sopenharmony_ci inet_twsk_bind_unhash(tw, hashinfo); 6962306a36Sopenharmony_ci spin_unlock(&bhead2->lock); 7062306a36Sopenharmony_ci spin_unlock(&bhead->lock); 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_ci refcount_dec(&tw->tw_dr->tw_refcount); 7362306a36Sopenharmony_ci inet_twsk_put(tw); 7462306a36Sopenharmony_ci} 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_civoid inet_twsk_free(struct inet_timewait_sock *tw) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci struct module *owner = tw->tw_prot->owner; 7962306a36Sopenharmony_ci twsk_destructor((struct sock *)tw); 8062306a36Sopenharmony_ci kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); 8162306a36Sopenharmony_ci module_put(owner); 8262306a36Sopenharmony_ci} 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_civoid inet_twsk_put(struct inet_timewait_sock *tw) 8562306a36Sopenharmony_ci{ 8662306a36Sopenharmony_ci if (refcount_dec_and_test(&tw->tw_refcnt)) 8762306a36Sopenharmony_ci inet_twsk_free(tw); 8862306a36Sopenharmony_ci} 8962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_twsk_put); 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_cistatic void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, 9262306a36Sopenharmony_ci struct hlist_nulls_head *list) 9362306a36Sopenharmony_ci{ 9462306a36Sopenharmony_ci hlist_nulls_add_head_rcu(&tw->tw_node, list); 9562306a36Sopenharmony_ci} 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_cistatic void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, 9862306a36Sopenharmony_ci struct hlist_head *list) 9962306a36Sopenharmony_ci{ 10062306a36Sopenharmony_ci hlist_add_head(&tw->tw_bind_node, list); 10162306a36Sopenharmony_ci} 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_cistatic void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, 10462306a36Sopenharmony_ci struct hlist_head *list) 10562306a36Sopenharmony_ci{ 10662306a36Sopenharmony_ci hlist_add_head(&tw->tw_bind2_node, list); 10762306a36Sopenharmony_ci} 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci/* 11062306a36Sopenharmony_ci * Enter the time wait state. This is called with locally disabled BH. 11162306a36Sopenharmony_ci * Essentially we whip up a timewait bucket, copy the relevant info into it 11262306a36Sopenharmony_ci * from the SK, and mess with hash chains and list linkage. 11362306a36Sopenharmony_ci */ 11462306a36Sopenharmony_civoid inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, 11562306a36Sopenharmony_ci struct inet_hashinfo *hashinfo) 11662306a36Sopenharmony_ci{ 11762306a36Sopenharmony_ci const struct inet_sock *inet = inet_sk(sk); 11862306a36Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 11962306a36Sopenharmony_ci struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); 12062306a36Sopenharmony_ci spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 12162306a36Sopenharmony_ci struct inet_bind_hashbucket *bhead, *bhead2; 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci /* Step 1: Put TW into bind hash. Original socket stays there too. 12462306a36Sopenharmony_ci Note, that any socket with inet->num != 0 MUST be bound in 12562306a36Sopenharmony_ci binding cache, even if it is closed. 12662306a36Sopenharmony_ci */ 12762306a36Sopenharmony_ci bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, 12862306a36Sopenharmony_ci hashinfo->bhash_size)]; 12962306a36Sopenharmony_ci bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci spin_lock(&bhead->lock); 13262306a36Sopenharmony_ci spin_lock(&bhead2->lock); 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_ci tw->tw_tb = icsk->icsk_bind_hash; 13562306a36Sopenharmony_ci WARN_ON(!icsk->icsk_bind_hash); 13662306a36Sopenharmony_ci inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci tw->tw_tb2 = icsk->icsk_bind2_hash; 13962306a36Sopenharmony_ci WARN_ON(!icsk->icsk_bind2_hash); 14062306a36Sopenharmony_ci inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci spin_unlock(&bhead2->lock); 14362306a36Sopenharmony_ci spin_unlock(&bhead->lock); 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci spin_lock(lock); 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci inet_twsk_add_node_rcu(tw, &ehead->chain); 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci /* Step 3: Remove SK from hash chain */ 15062306a36Sopenharmony_ci if (__sk_nulls_del_node_init_rcu(sk)) 15162306a36Sopenharmony_ci sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci spin_unlock(lock); 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci /* tw_refcnt is set to 3 because we have : 15662306a36Sopenharmony_ci * - one reference for bhash chain. 15762306a36Sopenharmony_ci * - one reference for ehash chain. 15862306a36Sopenharmony_ci * - one reference for timer. 15962306a36Sopenharmony_ci * We can use atomic_set() because prior spin_lock()/spin_unlock() 16062306a36Sopenharmony_ci * committed into memory all tw fields. 16162306a36Sopenharmony_ci * Also note that after this point, we lost our implicit reference 16262306a36Sopenharmony_ci * so we are not allowed to use tw anymore. 16362306a36Sopenharmony_ci */ 16462306a36Sopenharmony_ci refcount_set(&tw->tw_refcnt, 3); 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_twsk_hashdance); 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_cistatic void tw_timer_handler(struct timer_list *t) 16962306a36Sopenharmony_ci{ 17062306a36Sopenharmony_ci struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_ci inet_twsk_kill(tw); 17362306a36Sopenharmony_ci} 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_cistruct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, 17662306a36Sopenharmony_ci struct inet_timewait_death_row *dr, 17762306a36Sopenharmony_ci const int state) 17862306a36Sopenharmony_ci{ 17962306a36Sopenharmony_ci struct inet_timewait_sock *tw; 18062306a36Sopenharmony_ci 18162306a36Sopenharmony_ci if (refcount_read(&dr->tw_refcount) - 1 >= 18262306a36Sopenharmony_ci READ_ONCE(dr->sysctl_max_tw_buckets)) 18362306a36Sopenharmony_ci return NULL; 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, 18662306a36Sopenharmony_ci GFP_ATOMIC); 18762306a36Sopenharmony_ci if (tw) { 18862306a36Sopenharmony_ci const struct inet_sock *inet = inet_sk(sk); 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci tw->tw_dr = dr; 19162306a36Sopenharmony_ci /* Give us an identity. */ 19262306a36Sopenharmony_ci tw->tw_daddr = inet->inet_daddr; 19362306a36Sopenharmony_ci tw->tw_rcv_saddr = inet->inet_rcv_saddr; 19462306a36Sopenharmony_ci tw->tw_bound_dev_if = sk->sk_bound_dev_if; 19562306a36Sopenharmony_ci tw->tw_tos = inet->tos; 19662306a36Sopenharmony_ci tw->tw_num = inet->inet_num; 19762306a36Sopenharmony_ci tw->tw_state = TCP_TIME_WAIT; 19862306a36Sopenharmony_ci tw->tw_substate = state; 19962306a36Sopenharmony_ci tw->tw_sport = inet->inet_sport; 20062306a36Sopenharmony_ci tw->tw_dport = inet->inet_dport; 20162306a36Sopenharmony_ci tw->tw_family = sk->sk_family; 20262306a36Sopenharmony_ci tw->tw_reuse = sk->sk_reuse; 20362306a36Sopenharmony_ci tw->tw_reuseport = sk->sk_reuseport; 20462306a36Sopenharmony_ci tw->tw_hash = sk->sk_hash; 20562306a36Sopenharmony_ci tw->tw_ipv6only = 0; 20662306a36Sopenharmony_ci tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); 20762306a36Sopenharmony_ci tw->tw_prot = sk->sk_prot_creator; 20862306a36Sopenharmony_ci atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); 20962306a36Sopenharmony_ci twsk_net_set(tw, sock_net(sk)); 21062306a36Sopenharmony_ci timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); 21162306a36Sopenharmony_ci /* 21262306a36Sopenharmony_ci * Because we use RCU lookups, we should not set tw_refcnt 21362306a36Sopenharmony_ci * to a non null value before everything is setup for this 21462306a36Sopenharmony_ci * timewait socket. 21562306a36Sopenharmony_ci */ 21662306a36Sopenharmony_ci refcount_set(&tw->tw_refcnt, 0); 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_ci __module_get(tw->tw_prot->owner); 21962306a36Sopenharmony_ci } 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci return tw; 22262306a36Sopenharmony_ci} 22362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_twsk_alloc); 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci/* These are always called from BH context. See callers in 22662306a36Sopenharmony_ci * tcp_input.c to verify this. 22762306a36Sopenharmony_ci */ 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci/* This is for handling early-kills of TIME_WAIT sockets. 23062306a36Sopenharmony_ci * Warning : consume reference. 23162306a36Sopenharmony_ci * Caller should not access tw anymore. 23262306a36Sopenharmony_ci */ 23362306a36Sopenharmony_civoid inet_twsk_deschedule_put(struct inet_timewait_sock *tw) 23462306a36Sopenharmony_ci{ 23562306a36Sopenharmony_ci if (del_timer_sync(&tw->tw_timer)) 23662306a36Sopenharmony_ci inet_twsk_kill(tw); 23762306a36Sopenharmony_ci inet_twsk_put(tw); 23862306a36Sopenharmony_ci} 23962306a36Sopenharmony_ciEXPORT_SYMBOL(inet_twsk_deschedule_put); 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_civoid __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) 24262306a36Sopenharmony_ci{ 24362306a36Sopenharmony_ci /* timeout := RTO * 3.5 24462306a36Sopenharmony_ci * 24562306a36Sopenharmony_ci * 3.5 = 1+2+0.5 to wait for two retransmits. 24662306a36Sopenharmony_ci * 24762306a36Sopenharmony_ci * RATIONALE: if FIN arrived and we entered TIME-WAIT state, 24862306a36Sopenharmony_ci * our ACK acking that FIN can be lost. If N subsequent retransmitted 24962306a36Sopenharmony_ci * FINs (or previous seqments) are lost (probability of such event 25062306a36Sopenharmony_ci * is p^(N+1), where p is probability to lose single packet and 25162306a36Sopenharmony_ci * time to detect the loss is about RTO*(2^N - 1) with exponential 25262306a36Sopenharmony_ci * backoff). Normal timewait length is calculated so, that we 25362306a36Sopenharmony_ci * waited at least for one retransmitted FIN (maximal RTO is 120sec). 25462306a36Sopenharmony_ci * [ BTW Linux. following BSD, violates this requirement waiting 25562306a36Sopenharmony_ci * only for 60sec, we should wait at least for 240 secs. 25662306a36Sopenharmony_ci * Well, 240 consumes too much of resources 8) 25762306a36Sopenharmony_ci * ] 25862306a36Sopenharmony_ci * This interval is not reduced to catch old duplicate and 25962306a36Sopenharmony_ci * responces to our wandering segments living for two MSLs. 26062306a36Sopenharmony_ci * However, if we use PAWS to detect 26162306a36Sopenharmony_ci * old duplicates, we can reduce the interval to bounds required 26262306a36Sopenharmony_ci * by RTO, rather than MSL. So, if peer understands PAWS, we 26362306a36Sopenharmony_ci * kill tw bucket after 3.5*RTO (it is important that this number 26462306a36Sopenharmony_ci * is greater than TS tick!) and detect old duplicates with help 26562306a36Sopenharmony_ci * of PAWS. 26662306a36Sopenharmony_ci */ 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci if (!rearm) { 26962306a36Sopenharmony_ci bool kill = timeo <= 4*HZ; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : 27262306a36Sopenharmony_ci LINUX_MIB_TIMEWAITED); 27362306a36Sopenharmony_ci BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); 27462306a36Sopenharmony_ci refcount_inc(&tw->tw_dr->tw_refcount); 27562306a36Sopenharmony_ci } else { 27662306a36Sopenharmony_ci mod_timer_pending(&tw->tw_timer, jiffies + timeo); 27762306a36Sopenharmony_ci } 27862306a36Sopenharmony_ci} 27962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(__inet_twsk_schedule); 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ 28262306a36Sopenharmony_civoid inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) 28362306a36Sopenharmony_ci{ 28462306a36Sopenharmony_ci struct hlist_nulls_node *node; 28562306a36Sopenharmony_ci unsigned int slot; 28662306a36Sopenharmony_ci struct sock *sk; 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { 28962306a36Sopenharmony_ci struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 29062306a36Sopenharmony_cirestart_rcu: 29162306a36Sopenharmony_ci cond_resched(); 29262306a36Sopenharmony_ci rcu_read_lock(); 29362306a36Sopenharmony_cirestart: 29462306a36Sopenharmony_ci sk_nulls_for_each_rcu(sk, node, &head->chain) { 29562306a36Sopenharmony_ci int state = inet_sk_state_load(sk); 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci if ((1 << state) & ~(TCPF_TIME_WAIT | 29862306a36Sopenharmony_ci TCPF_NEW_SYN_RECV)) 29962306a36Sopenharmony_ci continue; 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci if (sk->sk_family != family || 30262306a36Sopenharmony_ci refcount_read(&sock_net(sk)->ns.count)) 30362306a36Sopenharmony_ci continue; 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 30662306a36Sopenharmony_ci continue; 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci if (unlikely(sk->sk_family != family || 30962306a36Sopenharmony_ci refcount_read(&sock_net(sk)->ns.count))) { 31062306a36Sopenharmony_ci sock_gen_put(sk); 31162306a36Sopenharmony_ci goto restart; 31262306a36Sopenharmony_ci } 31362306a36Sopenharmony_ci 31462306a36Sopenharmony_ci rcu_read_unlock(); 31562306a36Sopenharmony_ci local_bh_disable(); 31662306a36Sopenharmony_ci if (state == TCP_TIME_WAIT) { 31762306a36Sopenharmony_ci inet_twsk_deschedule_put(inet_twsk(sk)); 31862306a36Sopenharmony_ci } else { 31962306a36Sopenharmony_ci struct request_sock *req = inet_reqsk(sk); 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, 32262306a36Sopenharmony_ci req); 32362306a36Sopenharmony_ci } 32462306a36Sopenharmony_ci local_bh_enable(); 32562306a36Sopenharmony_ci goto restart_rcu; 32662306a36Sopenharmony_ci } 32762306a36Sopenharmony_ci /* If the nulls value we got at the end of this lookup is 32862306a36Sopenharmony_ci * not the expected one, we must restart lookup. 32962306a36Sopenharmony_ci * We probably met an item that was moved to another chain. 33062306a36Sopenharmony_ci */ 33162306a36Sopenharmony_ci if (get_nulls_value(node) != slot) 33262306a36Sopenharmony_ci goto restart; 33362306a36Sopenharmony_ci rcu_read_unlock(); 33462306a36Sopenharmony_ci } 33562306a36Sopenharmony_ci} 33662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(inet_twsk_purge); 337