162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* Multipath TCP 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * Copyright (c) 2017 - 2019, Intel Corporation. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci#define pr_fmt(fmt) "MPTCP: " fmt 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#include <linux/kernel.h> 1062306a36Sopenharmony_ci#include <linux/module.h> 1162306a36Sopenharmony_ci#include <linux/netdevice.h> 1262306a36Sopenharmony_ci#include <linux/sched/signal.h> 1362306a36Sopenharmony_ci#include <linux/atomic.h> 1462306a36Sopenharmony_ci#include <net/sock.h> 1562306a36Sopenharmony_ci#include <net/inet_common.h> 1662306a36Sopenharmony_ci#include <net/inet_hashtables.h> 1762306a36Sopenharmony_ci#include <net/protocol.h> 1862306a36Sopenharmony_ci#include <net/tcp.h> 1962306a36Sopenharmony_ci#include <net/tcp_states.h> 2062306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 2162306a36Sopenharmony_ci#include <net/transp_v6.h> 2262306a36Sopenharmony_ci#endif 2362306a36Sopenharmony_ci#include <net/mptcp.h> 2462306a36Sopenharmony_ci#include <net/xfrm.h> 2562306a36Sopenharmony_ci#include <asm/ioctls.h> 2662306a36Sopenharmony_ci#include "protocol.h" 2762306a36Sopenharmony_ci#include "mib.h" 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci#define CREATE_TRACE_POINTS 3062306a36Sopenharmony_ci#include <trace/events/mptcp.h> 3162306a36Sopenharmony_ci 3262306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 3362306a36Sopenharmony_cistruct mptcp6_sock { 3462306a36Sopenharmony_ci struct mptcp_sock msk; 3562306a36Sopenharmony_ci struct ipv6_pinfo np; 3662306a36Sopenharmony_ci}; 3762306a36Sopenharmony_ci#endif 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_cienum { 4062306a36Sopenharmony_ci MPTCP_CMSG_TS = BIT(0), 4162306a36Sopenharmony_ci MPTCP_CMSG_INQ = BIT(1), 4262306a36Sopenharmony_ci}; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_cistatic struct percpu_counter mptcp_sockets_allocated 
____cacheline_aligned_in_smp; 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_cistatic void __mptcp_destroy_sock(struct sock *sk); 4762306a36Sopenharmony_cistatic void mptcp_check_send_data_fin(struct sock *sk); 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ciDEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); 5062306a36Sopenharmony_cistatic struct net_device mptcp_napi_dev; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci/* Returns end sequence number of the receiver's advertised window */ 5362306a36Sopenharmony_cistatic u64 mptcp_wnd_end(const struct mptcp_sock *msk) 5462306a36Sopenharmony_ci{ 5562306a36Sopenharmony_ci return READ_ONCE(msk->wnd_end); 5662306a36Sopenharmony_ci} 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_cistatic bool mptcp_is_tcpsk(struct sock *sk) 5962306a36Sopenharmony_ci{ 6062306a36Sopenharmony_ci struct socket *sock = sk->sk_socket; 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci if (unlikely(sk->sk_prot == &tcp_prot)) { 6362306a36Sopenharmony_ci /* we are being invoked after mptcp_accept() has 6462306a36Sopenharmony_ci * accepted a non-mp-capable flow: sk is a tcp_sk, 6562306a36Sopenharmony_ci * not an mptcp one. 6662306a36Sopenharmony_ci * 6762306a36Sopenharmony_ci * Hand the socket over to tcp so all further socket ops 6862306a36Sopenharmony_ci * bypass mptcp. 
6962306a36Sopenharmony_ci */ 7062306a36Sopenharmony_ci WRITE_ONCE(sock->ops, &inet_stream_ops); 7162306a36Sopenharmony_ci return true; 7262306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 7362306a36Sopenharmony_ci } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { 7462306a36Sopenharmony_ci WRITE_ONCE(sock->ops, &inet6_stream_ops); 7562306a36Sopenharmony_ci return true; 7662306a36Sopenharmony_ci#endif 7762306a36Sopenharmony_ci } 7862306a36Sopenharmony_ci 7962306a36Sopenharmony_ci return false; 8062306a36Sopenharmony_ci} 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_cistatic int __mptcp_socket_create(struct mptcp_sock *msk) 8362306a36Sopenharmony_ci{ 8462306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 8562306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 8662306a36Sopenharmony_ci struct socket *ssock; 8762306a36Sopenharmony_ci int err; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock); 9062306a36Sopenharmony_ci if (err) 9162306a36Sopenharmony_ci return err; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; 9462306a36Sopenharmony_ci WRITE_ONCE(msk->first, ssock->sk); 9562306a36Sopenharmony_ci subflow = mptcp_subflow_ctx(ssock->sk); 9662306a36Sopenharmony_ci list_add(&subflow->node, &msk->conn_list); 9762306a36Sopenharmony_ci sock_hold(ssock->sk); 9862306a36Sopenharmony_ci subflow->request_mptcp = 1; 9962306a36Sopenharmony_ci subflow->subflow_id = msk->subflow_id++; 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_ci /* This is the first subflow, always with id 0 */ 10262306a36Sopenharmony_ci WRITE_ONCE(subflow->local_id, 0); 10362306a36Sopenharmony_ci mptcp_sock_graft(msk->first, sk->sk_socket); 10462306a36Sopenharmony_ci iput(SOCK_INODE(ssock)); 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci return 0; 10762306a36Sopenharmony_ci} 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci/* If the MPC handshake is 
not started, returns the first subflow, 11062306a36Sopenharmony_ci * eventually allocating it. 11162306a36Sopenharmony_ci */ 11262306a36Sopenharmony_cistruct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) 11362306a36Sopenharmony_ci{ 11462306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 11562306a36Sopenharmony_ci int ret; 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_ci if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 11862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci if (!msk->first) { 12162306a36Sopenharmony_ci ret = __mptcp_socket_create(msk); 12262306a36Sopenharmony_ci if (ret) 12362306a36Sopenharmony_ci return ERR_PTR(ret); 12462306a36Sopenharmony_ci 12562306a36Sopenharmony_ci mptcp_sockopt_sync(msk, msk->first); 12662306a36Sopenharmony_ci } 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci return msk->first; 12962306a36Sopenharmony_ci} 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_cistatic void mptcp_drop(struct sock *sk, struct sk_buff *skb) 13262306a36Sopenharmony_ci{ 13362306a36Sopenharmony_ci sk_drops_add(sk, skb); 13462306a36Sopenharmony_ci __kfree_skb(skb); 13562306a36Sopenharmony_ci} 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_cistatic void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) 13862306a36Sopenharmony_ci{ 13962306a36Sopenharmony_ci WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, 14062306a36Sopenharmony_ci mptcp_sk(sk)->rmem_fwd_alloc + size); 14162306a36Sopenharmony_ci} 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_cistatic void mptcp_rmem_charge(struct sock *sk, int size) 14462306a36Sopenharmony_ci{ 14562306a36Sopenharmony_ci mptcp_rmem_fwd_alloc_add(sk, -size); 14662306a36Sopenharmony_ci} 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_cistatic bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 14962306a36Sopenharmony_ci struct sk_buff *from) 15062306a36Sopenharmony_ci{ 15162306a36Sopenharmony_ci bool fragstolen; 
15262306a36Sopenharmony_ci int delta; 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci if (MPTCP_SKB_CB(from)->offset || 15562306a36Sopenharmony_ci !skb_try_coalesce(to, from, &fragstolen, &delta)) 15662306a36Sopenharmony_ci return false; 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", 15962306a36Sopenharmony_ci MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, 16062306a36Sopenharmony_ci to->len, MPTCP_SKB_CB(from)->end_seq); 16162306a36Sopenharmony_ci MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci /* note the fwd memory can reach a negative value after accounting 16462306a36Sopenharmony_ci * for the delta, but the later skb free will restore a non 16562306a36Sopenharmony_ci * negative one 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_ci atomic_add(delta, &sk->sk_rmem_alloc); 16862306a36Sopenharmony_ci mptcp_rmem_charge(sk, delta); 16962306a36Sopenharmony_ci kfree_skb_partial(from, fragstolen); 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci return true; 17262306a36Sopenharmony_ci} 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_cistatic bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, 17562306a36Sopenharmony_ci struct sk_buff *from) 17662306a36Sopenharmony_ci{ 17762306a36Sopenharmony_ci if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) 17862306a36Sopenharmony_ci return false; 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci return mptcp_try_coalesce((struct sock *)msk, to, from); 18162306a36Sopenharmony_ci} 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_cistatic void __mptcp_rmem_reclaim(struct sock *sk, int amount) 18462306a36Sopenharmony_ci{ 18562306a36Sopenharmony_ci amount >>= PAGE_SHIFT; 18662306a36Sopenharmony_ci mptcp_rmem_charge(sk, amount << PAGE_SHIFT); 18762306a36Sopenharmony_ci __sk_mem_reduce_allocated(sk, amount); 18862306a36Sopenharmony_ci} 
18962306a36Sopenharmony_ci 19062306a36Sopenharmony_cistatic void mptcp_rmem_uncharge(struct sock *sk, int size) 19162306a36Sopenharmony_ci{ 19262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 19362306a36Sopenharmony_ci int reclaimable; 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci mptcp_rmem_fwd_alloc_add(sk, size); 19662306a36Sopenharmony_ci reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci /* see sk_mem_uncharge() for the rationale behind the following schema */ 19962306a36Sopenharmony_ci if (unlikely(reclaimable >= PAGE_SIZE)) 20062306a36Sopenharmony_ci __mptcp_rmem_reclaim(sk, reclaimable); 20162306a36Sopenharmony_ci} 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_cistatic void mptcp_rfree(struct sk_buff *skb) 20462306a36Sopenharmony_ci{ 20562306a36Sopenharmony_ci unsigned int len = skb->truesize; 20662306a36Sopenharmony_ci struct sock *sk = skb->sk; 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci atomic_sub(len, &sk->sk_rmem_alloc); 20962306a36Sopenharmony_ci mptcp_rmem_uncharge(sk, len); 21062306a36Sopenharmony_ci} 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_civoid mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) 21362306a36Sopenharmony_ci{ 21462306a36Sopenharmony_ci skb_orphan(skb); 21562306a36Sopenharmony_ci skb->sk = sk; 21662306a36Sopenharmony_ci skb->destructor = mptcp_rfree; 21762306a36Sopenharmony_ci atomic_add(skb->truesize, &sk->sk_rmem_alloc); 21862306a36Sopenharmony_ci mptcp_rmem_charge(sk, skb->truesize); 21962306a36Sopenharmony_ci} 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci/* "inspired" by tcp_data_queue_ofo(), main differences: 22262306a36Sopenharmony_ci * - use mptcp seqs 22362306a36Sopenharmony_ci * - don't cope with sacks 22462306a36Sopenharmony_ci */ 22562306a36Sopenharmony_cistatic void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) 22662306a36Sopenharmony_ci{ 22762306a36Sopenharmony_ci struct sock 
*sk = (struct sock *)msk; 22862306a36Sopenharmony_ci struct rb_node **p, *parent; 22962306a36Sopenharmony_ci u64 seq, end_seq, max_seq; 23062306a36Sopenharmony_ci struct sk_buff *skb1; 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci seq = MPTCP_SKB_CB(skb)->map_seq; 23362306a36Sopenharmony_ci end_seq = MPTCP_SKB_CB(skb)->end_seq; 23462306a36Sopenharmony_ci max_seq = atomic64_read(&msk->rcv_wnd_sent); 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, 23762306a36Sopenharmony_ci RB_EMPTY_ROOT(&msk->out_of_order_queue)); 23862306a36Sopenharmony_ci if (after64(end_seq, max_seq)) { 23962306a36Sopenharmony_ci /* out of window */ 24062306a36Sopenharmony_ci mptcp_drop(sk, skb); 24162306a36Sopenharmony_ci pr_debug("oow by %lld, rcv_wnd_sent %llu\n", 24262306a36Sopenharmony_ci (unsigned long long)end_seq - (unsigned long)max_seq, 24362306a36Sopenharmony_ci (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); 24462306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); 24562306a36Sopenharmony_ci return; 24662306a36Sopenharmony_ci } 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci p = &msk->out_of_order_queue.rb_node; 24962306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); 25062306a36Sopenharmony_ci if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { 25162306a36Sopenharmony_ci rb_link_node(&skb->rbnode, NULL, p); 25262306a36Sopenharmony_ci rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 25362306a36Sopenharmony_ci msk->ooo_last_skb = skb; 25462306a36Sopenharmony_ci goto end; 25562306a36Sopenharmony_ci } 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci /* with 2 subflows, adding at end of ooo queue is quite likely 25862306a36Sopenharmony_ci * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
25962306a36Sopenharmony_ci */ 26062306a36Sopenharmony_ci if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { 26162306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 26262306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 26362306a36Sopenharmony_ci return; 26462306a36Sopenharmony_ci } 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ 26762306a36Sopenharmony_ci if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { 26862306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 26962306a36Sopenharmony_ci parent = &msk->ooo_last_skb->rbnode; 27062306a36Sopenharmony_ci p = &parent->rb_right; 27162306a36Sopenharmony_ci goto insert; 27262306a36Sopenharmony_ci } 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci /* Find place to insert this segment. Handle overlaps on the way. */ 27562306a36Sopenharmony_ci parent = NULL; 27662306a36Sopenharmony_ci while (*p) { 27762306a36Sopenharmony_ci parent = *p; 27862306a36Sopenharmony_ci skb1 = rb_to_skb(parent); 27962306a36Sopenharmony_ci if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 28062306a36Sopenharmony_ci p = &parent->rb_left; 28162306a36Sopenharmony_ci continue; 28262306a36Sopenharmony_ci } 28362306a36Sopenharmony_ci if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { 28462306a36Sopenharmony_ci if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { 28562306a36Sopenharmony_ci /* All the bits are present. Drop. 
*/ 28662306a36Sopenharmony_ci mptcp_drop(sk, skb); 28762306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 28862306a36Sopenharmony_ci return; 28962306a36Sopenharmony_ci } 29062306a36Sopenharmony_ci if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 29162306a36Sopenharmony_ci /* partial overlap: 29262306a36Sopenharmony_ci * | skb | 29362306a36Sopenharmony_ci * | skb1 | 29462306a36Sopenharmony_ci * continue traversing 29562306a36Sopenharmony_ci */ 29662306a36Sopenharmony_ci } else { 29762306a36Sopenharmony_ci /* skb's seq == skb1's seq and skb covers skb1. 29862306a36Sopenharmony_ci * Replace skb1 with skb. 29962306a36Sopenharmony_ci */ 30062306a36Sopenharmony_ci rb_replace_node(&skb1->rbnode, &skb->rbnode, 30162306a36Sopenharmony_ci &msk->out_of_order_queue); 30262306a36Sopenharmony_ci mptcp_drop(sk, skb1); 30362306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 30462306a36Sopenharmony_ci goto merge_right; 30562306a36Sopenharmony_ci } 30662306a36Sopenharmony_ci } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { 30762306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 30862306a36Sopenharmony_ci return; 30962306a36Sopenharmony_ci } 31062306a36Sopenharmony_ci p = &parent->rb_right; 31162306a36Sopenharmony_ci } 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ciinsert: 31462306a36Sopenharmony_ci /* Insert segment into RB tree. */ 31562306a36Sopenharmony_ci rb_link_node(&skb->rbnode, parent, p); 31662306a36Sopenharmony_ci rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_cimerge_right: 31962306a36Sopenharmony_ci /* Remove other segments covered by skb. 
*/ 32062306a36Sopenharmony_ci while ((skb1 = skb_rb_next(skb)) != NULL) { 32162306a36Sopenharmony_ci if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) 32262306a36Sopenharmony_ci break; 32362306a36Sopenharmony_ci rb_erase(&skb1->rbnode, &msk->out_of_order_queue); 32462306a36Sopenharmony_ci mptcp_drop(sk, skb1); 32562306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 32662306a36Sopenharmony_ci } 32762306a36Sopenharmony_ci /* If there is no skb after us, we are the last_skb ! */ 32862306a36Sopenharmony_ci if (!skb1) 32962306a36Sopenharmony_ci msk->ooo_last_skb = skb; 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ciend: 33262306a36Sopenharmony_ci skb_condense(skb); 33362306a36Sopenharmony_ci mptcp_set_owner_r(skb, sk); 33462306a36Sopenharmony_ci} 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_cistatic bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) 33762306a36Sopenharmony_ci{ 33862306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 33962306a36Sopenharmony_ci int amt, amount; 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci if (size <= msk->rmem_fwd_alloc) 34262306a36Sopenharmony_ci return true; 34362306a36Sopenharmony_ci 34462306a36Sopenharmony_ci size -= msk->rmem_fwd_alloc; 34562306a36Sopenharmony_ci amt = sk_mem_pages(size); 34662306a36Sopenharmony_ci amount = amt << PAGE_SHIFT; 34762306a36Sopenharmony_ci if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) 34862306a36Sopenharmony_ci return false; 34962306a36Sopenharmony_ci 35062306a36Sopenharmony_ci mptcp_rmem_fwd_alloc_add(sk, amount); 35162306a36Sopenharmony_ci return true; 35262306a36Sopenharmony_ci} 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_cistatic bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, 35562306a36Sopenharmony_ci struct sk_buff *skb, unsigned int offset, 35662306a36Sopenharmony_ci size_t copy_len) 35762306a36Sopenharmony_ci{ 35862306a36Sopenharmony_ci struct mptcp_subflow_context *subflow = 
mptcp_subflow_ctx(ssk); 35962306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 36062306a36Sopenharmony_ci struct sk_buff *tail; 36162306a36Sopenharmony_ci bool has_rxtstamp; 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci __skb_unlink(skb, &ssk->sk_receive_queue); 36462306a36Sopenharmony_ci 36562306a36Sopenharmony_ci skb_ext_reset(skb); 36662306a36Sopenharmony_ci skb_orphan(skb); 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci /* try to fetch required memory from subflow */ 36962306a36Sopenharmony_ci if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) 37062306a36Sopenharmony_ci goto drop; 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 37362306a36Sopenharmony_ci 37462306a36Sopenharmony_ci /* the skb map_seq accounts for the skb offset: 37562306a36Sopenharmony_ci * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq 37662306a36Sopenharmony_ci * value 37762306a36Sopenharmony_ci */ 37862306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); 37962306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; 38062306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->offset = offset; 38162306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 38462306a36Sopenharmony_ci /* in sequence */ 38562306a36Sopenharmony_ci msk->bytes_received += copy_len; 38662306a36Sopenharmony_ci WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); 38762306a36Sopenharmony_ci tail = skb_peek_tail(&sk->sk_receive_queue); 38862306a36Sopenharmony_ci if (tail && mptcp_try_coalesce(sk, tail, skb)) 38962306a36Sopenharmony_ci return true; 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci mptcp_set_owner_r(skb, sk); 39262306a36Sopenharmony_ci __skb_queue_tail(&sk->sk_receive_queue, skb); 39362306a36Sopenharmony_ci return true; 
39462306a36Sopenharmony_ci } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { 39562306a36Sopenharmony_ci mptcp_data_queue_ofo(msk, skb); 39662306a36Sopenharmony_ci return false; 39762306a36Sopenharmony_ci } 39862306a36Sopenharmony_ci 39962306a36Sopenharmony_ci /* old data, keep it simple and drop the whole pkt, sender 40062306a36Sopenharmony_ci * will retransmit as needed, if needed. 40162306a36Sopenharmony_ci */ 40262306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 40362306a36Sopenharmony_cidrop: 40462306a36Sopenharmony_ci mptcp_drop(sk, skb); 40562306a36Sopenharmony_ci return false; 40662306a36Sopenharmony_ci} 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_cistatic void mptcp_stop_rtx_timer(struct sock *sk) 40962306a36Sopenharmony_ci{ 41062306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_ci sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 41362306a36Sopenharmony_ci mptcp_sk(sk)->timer_ival = 0; 41462306a36Sopenharmony_ci} 41562306a36Sopenharmony_ci 41662306a36Sopenharmony_cistatic void mptcp_close_wake_up(struct sock *sk) 41762306a36Sopenharmony_ci{ 41862306a36Sopenharmony_ci if (sock_flag(sk, SOCK_DEAD)) 41962306a36Sopenharmony_ci return; 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_ci sk->sk_state_change(sk); 42262306a36Sopenharmony_ci if (sk->sk_shutdown == SHUTDOWN_MASK || 42362306a36Sopenharmony_ci sk->sk_state == TCP_CLOSE) 42462306a36Sopenharmony_ci sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 42562306a36Sopenharmony_ci else 42662306a36Sopenharmony_ci sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 42762306a36Sopenharmony_ci} 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_cistatic bool mptcp_pending_data_fin_ack(struct sock *sk) 43062306a36Sopenharmony_ci{ 43162306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 43262306a36Sopenharmony_ci 43362306a36Sopenharmony_ci return ((1 << sk->sk_state) & 43462306a36Sopenharmony_ci 
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && 43562306a36Sopenharmony_ci msk->write_seq == READ_ONCE(msk->snd_una); 43662306a36Sopenharmony_ci} 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_cistatic void mptcp_check_data_fin_ack(struct sock *sk) 43962306a36Sopenharmony_ci{ 44062306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 44162306a36Sopenharmony_ci 44262306a36Sopenharmony_ci /* Look for an acknowledged DATA_FIN */ 44362306a36Sopenharmony_ci if (mptcp_pending_data_fin_ack(sk)) { 44462306a36Sopenharmony_ci WRITE_ONCE(msk->snd_data_fin_enable, 0); 44562306a36Sopenharmony_ci 44662306a36Sopenharmony_ci switch (sk->sk_state) { 44762306a36Sopenharmony_ci case TCP_FIN_WAIT1: 44862306a36Sopenharmony_ci mptcp_set_state(sk, TCP_FIN_WAIT2); 44962306a36Sopenharmony_ci break; 45062306a36Sopenharmony_ci case TCP_CLOSING: 45162306a36Sopenharmony_ci case TCP_LAST_ACK: 45262306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 45362306a36Sopenharmony_ci break; 45462306a36Sopenharmony_ci } 45562306a36Sopenharmony_ci 45662306a36Sopenharmony_ci mptcp_close_wake_up(sk); 45762306a36Sopenharmony_ci } 45862306a36Sopenharmony_ci} 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_cistatic bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) 46162306a36Sopenharmony_ci{ 46262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 46362306a36Sopenharmony_ci 46462306a36Sopenharmony_ci if (READ_ONCE(msk->rcv_data_fin) && 46562306a36Sopenharmony_ci ((1 << sk->sk_state) & 46662306a36Sopenharmony_ci (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { 46762306a36Sopenharmony_ci u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); 46862306a36Sopenharmony_ci 46962306a36Sopenharmony_ci if (msk->ack_seq == rcv_data_fin_seq) { 47062306a36Sopenharmony_ci if (seq) 47162306a36Sopenharmony_ci *seq = rcv_data_fin_seq; 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci return true; 47462306a36Sopenharmony_ci } 47562306a36Sopenharmony_ci } 47662306a36Sopenharmony_ci 
47762306a36Sopenharmony_ci return false; 47862306a36Sopenharmony_ci} 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_cistatic void mptcp_set_datafin_timeout(struct sock *sk) 48162306a36Sopenharmony_ci{ 48262306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 48362306a36Sopenharmony_ci u32 retransmits; 48462306a36Sopenharmony_ci 48562306a36Sopenharmony_ci retransmits = min_t(u32, icsk->icsk_retransmits, 48662306a36Sopenharmony_ci ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); 48762306a36Sopenharmony_ci 48862306a36Sopenharmony_ci mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; 48962306a36Sopenharmony_ci} 49062306a36Sopenharmony_ci 49162306a36Sopenharmony_cistatic void __mptcp_set_timeout(struct sock *sk, long tout) 49262306a36Sopenharmony_ci{ 49362306a36Sopenharmony_ci mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; 49462306a36Sopenharmony_ci} 49562306a36Sopenharmony_ci 49662306a36Sopenharmony_cistatic long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) 49762306a36Sopenharmony_ci{ 49862306a36Sopenharmony_ci const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
50162306a36Sopenharmony_ci inet_csk(ssk)->icsk_timeout - jiffies : 0; 50262306a36Sopenharmony_ci} 50362306a36Sopenharmony_ci 50462306a36Sopenharmony_cistatic void mptcp_set_timeout(struct sock *sk) 50562306a36Sopenharmony_ci{ 50662306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 50762306a36Sopenharmony_ci long tout = 0; 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci mptcp_for_each_subflow(mptcp_sk(sk), subflow) 51062306a36Sopenharmony_ci tout = max(tout, mptcp_timeout_from_subflow(subflow)); 51162306a36Sopenharmony_ci __mptcp_set_timeout(sk, tout); 51262306a36Sopenharmony_ci} 51362306a36Sopenharmony_ci 51462306a36Sopenharmony_cistatic inline bool tcp_can_send_ack(const struct sock *ssk) 51562306a36Sopenharmony_ci{ 51662306a36Sopenharmony_ci return !((1 << inet_sk_state_load(ssk)) & 51762306a36Sopenharmony_ci (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); 51862306a36Sopenharmony_ci} 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_civoid __mptcp_subflow_send_ack(struct sock *ssk) 52162306a36Sopenharmony_ci{ 52262306a36Sopenharmony_ci if (tcp_can_send_ack(ssk)) 52362306a36Sopenharmony_ci tcp_send_ack(ssk); 52462306a36Sopenharmony_ci} 52562306a36Sopenharmony_ci 52662306a36Sopenharmony_cistatic void mptcp_subflow_send_ack(struct sock *ssk) 52762306a36Sopenharmony_ci{ 52862306a36Sopenharmony_ci bool slow; 52962306a36Sopenharmony_ci 53062306a36Sopenharmony_ci slow = lock_sock_fast(ssk); 53162306a36Sopenharmony_ci __mptcp_subflow_send_ack(ssk); 53262306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 53362306a36Sopenharmony_ci} 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_cistatic void mptcp_send_ack(struct mptcp_sock *msk) 53662306a36Sopenharmony_ci{ 53762306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) 54062306a36Sopenharmony_ci mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); 54162306a36Sopenharmony_ci} 
54262306a36Sopenharmony_ci 54362306a36Sopenharmony_cistatic void mptcp_subflow_cleanup_rbuf(struct sock *ssk) 54462306a36Sopenharmony_ci{ 54562306a36Sopenharmony_ci bool slow; 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_ci slow = lock_sock_fast(ssk); 54862306a36Sopenharmony_ci if (tcp_can_send_ack(ssk)) 54962306a36Sopenharmony_ci tcp_cleanup_rbuf(ssk, 1); 55062306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 55162306a36Sopenharmony_ci} 55262306a36Sopenharmony_ci 55362306a36Sopenharmony_cistatic bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) 55462306a36Sopenharmony_ci{ 55562306a36Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(ssk); 55662306a36Sopenharmony_ci u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); 55762306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(ssk); 55862306a36Sopenharmony_ci 55962306a36Sopenharmony_ci return (ack_pending & ICSK_ACK_SCHED) && 56062306a36Sopenharmony_ci ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > 56162306a36Sopenharmony_ci READ_ONCE(icsk->icsk_ack.rcv_mss)) || 56262306a36Sopenharmony_ci (rx_empty && ack_pending & 56362306a36Sopenharmony_ci (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); 56462306a36Sopenharmony_ci} 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_cistatic void mptcp_cleanup_rbuf(struct mptcp_sock *msk) 56762306a36Sopenharmony_ci{ 56862306a36Sopenharmony_ci int old_space = READ_ONCE(msk->old_wspace); 56962306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 57062306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 57162306a36Sopenharmony_ci int space = __mptcp_space(sk); 57262306a36Sopenharmony_ci bool cleanup, rx_empty; 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_ci cleanup = (space > 0) && (space >= (old_space << 1)); 57562306a36Sopenharmony_ci rx_empty = !__mptcp_rmem(sk); 57662306a36Sopenharmony_ci 57762306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 57862306a36Sopenharmony_ci struct sock *ssk = 
mptcp_subflow_tcp_sock(subflow); 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_ci if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) 58162306a36Sopenharmony_ci mptcp_subflow_cleanup_rbuf(ssk); 58262306a36Sopenharmony_ci } 58362306a36Sopenharmony_ci} 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_cistatic bool mptcp_check_data_fin(struct sock *sk) 58662306a36Sopenharmony_ci{ 58762306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 58862306a36Sopenharmony_ci u64 rcv_data_fin_seq; 58962306a36Sopenharmony_ci bool ret = false; 59062306a36Sopenharmony_ci 59162306a36Sopenharmony_ci /* Need to ack a DATA_FIN received from a peer while this side 59262306a36Sopenharmony_ci * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. 59362306a36Sopenharmony_ci * msk->rcv_data_fin was set when parsing the incoming options 59462306a36Sopenharmony_ci * at the subflow level and the msk lock was not held, so this 59562306a36Sopenharmony_ci * is the first opportunity to act on the DATA_FIN and change 59662306a36Sopenharmony_ci * the msk state. 59762306a36Sopenharmony_ci * 59862306a36Sopenharmony_ci * If we are caught up to the sequence number of the incoming 59962306a36Sopenharmony_ci * DATA_FIN, send the DATA_ACK now and do state transition. If 60062306a36Sopenharmony_ci * not caught up, do nothing and let the recv code send DATA_ACK 60162306a36Sopenharmony_ci * when catching up. 
60262306a36Sopenharmony_ci */ 60362306a36Sopenharmony_ci 60462306a36Sopenharmony_ci if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { 60562306a36Sopenharmony_ci WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); 60662306a36Sopenharmony_ci WRITE_ONCE(msk->rcv_data_fin, 0); 60762306a36Sopenharmony_ci 60862306a36Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); 60962306a36Sopenharmony_ci smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 61062306a36Sopenharmony_ci 61162306a36Sopenharmony_ci switch (sk->sk_state) { 61262306a36Sopenharmony_ci case TCP_ESTABLISHED: 61362306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE_WAIT); 61462306a36Sopenharmony_ci break; 61562306a36Sopenharmony_ci case TCP_FIN_WAIT1: 61662306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSING); 61762306a36Sopenharmony_ci break; 61862306a36Sopenharmony_ci case TCP_FIN_WAIT2: 61962306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 62062306a36Sopenharmony_ci break; 62162306a36Sopenharmony_ci default: 62262306a36Sopenharmony_ci /* Other states not expected */ 62362306a36Sopenharmony_ci WARN_ON_ONCE(1); 62462306a36Sopenharmony_ci break; 62562306a36Sopenharmony_ci } 62662306a36Sopenharmony_ci 62762306a36Sopenharmony_ci ret = true; 62862306a36Sopenharmony_ci if (!__mptcp_check_fallback(msk)) 62962306a36Sopenharmony_ci mptcp_send_ack(msk); 63062306a36Sopenharmony_ci mptcp_close_wake_up(sk); 63162306a36Sopenharmony_ci } 63262306a36Sopenharmony_ci return ret; 63362306a36Sopenharmony_ci} 63462306a36Sopenharmony_ci 63562306a36Sopenharmony_cistatic bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 63662306a36Sopenharmony_ci struct sock *ssk, 63762306a36Sopenharmony_ci unsigned int *bytes) 63862306a36Sopenharmony_ci{ 63962306a36Sopenharmony_ci struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 64062306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 64162306a36Sopenharmony_ci unsigned int moved = 0; 64262306a36Sopenharmony_ci bool 
more_data_avail; 64362306a36Sopenharmony_ci struct tcp_sock *tp; 64462306a36Sopenharmony_ci bool done = false; 64562306a36Sopenharmony_ci int sk_rbuf; 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 64862306a36Sopenharmony_ci 64962306a36Sopenharmony_ci if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 65062306a36Sopenharmony_ci int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 65162306a36Sopenharmony_ci 65262306a36Sopenharmony_ci if (unlikely(ssk_rbuf > sk_rbuf)) { 65362306a36Sopenharmony_ci WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); 65462306a36Sopenharmony_ci sk_rbuf = ssk_rbuf; 65562306a36Sopenharmony_ci } 65662306a36Sopenharmony_ci } 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci pr_debug("msk=%p ssk=%p", msk, ssk); 65962306a36Sopenharmony_ci tp = tcp_sk(ssk); 66062306a36Sopenharmony_ci do { 66162306a36Sopenharmony_ci u32 map_remaining, offset; 66262306a36Sopenharmony_ci u32 seq = tp->copied_seq; 66362306a36Sopenharmony_ci struct sk_buff *skb; 66462306a36Sopenharmony_ci bool fin; 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci /* try to move as much data as available */ 66762306a36Sopenharmony_ci map_remaining = subflow->map_data_len - 66862306a36Sopenharmony_ci mptcp_subflow_get_map_offset(subflow); 66962306a36Sopenharmony_ci 67062306a36Sopenharmony_ci skb = skb_peek(&ssk->sk_receive_queue); 67162306a36Sopenharmony_ci if (!skb) { 67262306a36Sopenharmony_ci /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), 67362306a36Sopenharmony_ci * a different CPU can have already processed the pending 67462306a36Sopenharmony_ci * data, stop here or we can enter an infinite loop 67562306a36Sopenharmony_ci */ 67662306a36Sopenharmony_ci if (!moved) 67762306a36Sopenharmony_ci done = true; 67862306a36Sopenharmony_ci break; 67962306a36Sopenharmony_ci } 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci if (__mptcp_check_fallback(msk)) { 68262306a36Sopenharmony_ci /* Under fallback skbs have no MPTCP extension and TCP could 
68362306a36Sopenharmony_ci * collapse them between the dummy map creation and the 68462306a36Sopenharmony_ci * current dequeue. Be sure to adjust the map size. 68562306a36Sopenharmony_ci */ 68662306a36Sopenharmony_ci map_remaining = skb->len; 68762306a36Sopenharmony_ci subflow->map_data_len = skb->len; 68862306a36Sopenharmony_ci } 68962306a36Sopenharmony_ci 69062306a36Sopenharmony_ci offset = seq - TCP_SKB_CB(skb)->seq; 69162306a36Sopenharmony_ci fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 69262306a36Sopenharmony_ci if (fin) { 69362306a36Sopenharmony_ci done = true; 69462306a36Sopenharmony_ci seq++; 69562306a36Sopenharmony_ci } 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci if (offset < skb->len) { 69862306a36Sopenharmony_ci size_t len = skb->len - offset; 69962306a36Sopenharmony_ci 70062306a36Sopenharmony_ci if (tp->urg_data) 70162306a36Sopenharmony_ci done = true; 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_ci if (__mptcp_move_skb(msk, ssk, skb, offset, len)) 70462306a36Sopenharmony_ci moved += len; 70562306a36Sopenharmony_ci seq += len; 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_ci if (WARN_ON_ONCE(map_remaining < len)) 70862306a36Sopenharmony_ci break; 70962306a36Sopenharmony_ci } else { 71062306a36Sopenharmony_ci WARN_ON_ONCE(!fin); 71162306a36Sopenharmony_ci sk_eat_skb(ssk, skb); 71262306a36Sopenharmony_ci done = true; 71362306a36Sopenharmony_ci } 71462306a36Sopenharmony_ci 71562306a36Sopenharmony_ci WRITE_ONCE(tp->copied_seq, seq); 71662306a36Sopenharmony_ci more_data_avail = mptcp_subflow_data_available(ssk); 71762306a36Sopenharmony_ci 71862306a36Sopenharmony_ci if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { 71962306a36Sopenharmony_ci done = true; 72062306a36Sopenharmony_ci break; 72162306a36Sopenharmony_ci } 72262306a36Sopenharmony_ci } while (more_data_avail); 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci *bytes += moved; 72562306a36Sopenharmony_ci return done; 72662306a36Sopenharmony_ci} 72762306a36Sopenharmony_ci 
72862306a36Sopenharmony_cistatic bool __mptcp_ofo_queue(struct mptcp_sock *msk) 72962306a36Sopenharmony_ci{ 73062306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 73162306a36Sopenharmony_ci struct sk_buff *skb, *tail; 73262306a36Sopenharmony_ci bool moved = false; 73362306a36Sopenharmony_ci struct rb_node *p; 73462306a36Sopenharmony_ci u64 end_seq; 73562306a36Sopenharmony_ci 73662306a36Sopenharmony_ci p = rb_first(&msk->out_of_order_queue); 73762306a36Sopenharmony_ci pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); 73862306a36Sopenharmony_ci while (p) { 73962306a36Sopenharmony_ci skb = rb_to_skb(p); 74062306a36Sopenharmony_ci if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) 74162306a36Sopenharmony_ci break; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci p = rb_next(p); 74462306a36Sopenharmony_ci rb_erase(&skb->rbnode, &msk->out_of_order_queue); 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_ci if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, 74762306a36Sopenharmony_ci msk->ack_seq))) { 74862306a36Sopenharmony_ci mptcp_drop(sk, skb); 74962306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 75062306a36Sopenharmony_ci continue; 75162306a36Sopenharmony_ci } 75262306a36Sopenharmony_ci 75362306a36Sopenharmony_ci end_seq = MPTCP_SKB_CB(skb)->end_seq; 75462306a36Sopenharmony_ci tail = skb_peek_tail(&sk->sk_receive_queue); 75562306a36Sopenharmony_ci if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { 75662306a36Sopenharmony_ci int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; 75762306a36Sopenharmony_ci 75862306a36Sopenharmony_ci /* skip overlapping data, if any */ 75962306a36Sopenharmony_ci pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d", 76062306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, 76162306a36Sopenharmony_ci delta); 76262306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->offset += delta; 76362306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->map_seq += delta; 
76462306a36Sopenharmony_ci __skb_queue_tail(&sk->sk_receive_queue, skb); 76562306a36Sopenharmony_ci } 76662306a36Sopenharmony_ci msk->bytes_received += end_seq - msk->ack_seq; 76762306a36Sopenharmony_ci msk->ack_seq = end_seq; 76862306a36Sopenharmony_ci moved = true; 76962306a36Sopenharmony_ci } 77062306a36Sopenharmony_ci return moved; 77162306a36Sopenharmony_ci} 77262306a36Sopenharmony_ci 77362306a36Sopenharmony_cistatic bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) 77462306a36Sopenharmony_ci{ 77562306a36Sopenharmony_ci int err = sock_error(ssk); 77662306a36Sopenharmony_ci int ssk_state; 77762306a36Sopenharmony_ci 77862306a36Sopenharmony_ci if (!err) 77962306a36Sopenharmony_ci return false; 78062306a36Sopenharmony_ci 78162306a36Sopenharmony_ci /* only propagate errors on fallen-back sockets or 78262306a36Sopenharmony_ci * on MPC connect 78362306a36Sopenharmony_ci */ 78462306a36Sopenharmony_ci if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) 78562306a36Sopenharmony_ci return false; 78662306a36Sopenharmony_ci 78762306a36Sopenharmony_ci /* We need to propagate only transition to CLOSE state. 78862306a36Sopenharmony_ci * Orphaned socket will see such state change via 78962306a36Sopenharmony_ci * subflow_sched_work_if_closed() and that path will properly 79062306a36Sopenharmony_ci * destroy the msk as needed. 
79162306a36Sopenharmony_ci */ 79262306a36Sopenharmony_ci ssk_state = inet_sk_state_load(ssk); 79362306a36Sopenharmony_ci if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) 79462306a36Sopenharmony_ci mptcp_set_state(sk, ssk_state); 79562306a36Sopenharmony_ci WRITE_ONCE(sk->sk_err, -err); 79662306a36Sopenharmony_ci 79762306a36Sopenharmony_ci /* This barrier is coupled with smp_rmb() in mptcp_poll() */ 79862306a36Sopenharmony_ci smp_wmb(); 79962306a36Sopenharmony_ci sk_error_report(sk); 80062306a36Sopenharmony_ci return true; 80162306a36Sopenharmony_ci} 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_civoid __mptcp_error_report(struct sock *sk) 80462306a36Sopenharmony_ci{ 80562306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 80662306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 80762306a36Sopenharmony_ci 80862306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) 80962306a36Sopenharmony_ci if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) 81062306a36Sopenharmony_ci break; 81162306a36Sopenharmony_ci} 81262306a36Sopenharmony_ci 81362306a36Sopenharmony_ci/* In most cases we will be able to lock the mptcp socket. If its already 81462306a36Sopenharmony_ci * owned, we need to defer to the work queue to avoid ABBA deadlock. 
81562306a36Sopenharmony_ci */ 81662306a36Sopenharmony_cistatic bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) 81762306a36Sopenharmony_ci{ 81862306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 81962306a36Sopenharmony_ci unsigned int moved = 0; 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 82262306a36Sopenharmony_ci __mptcp_ofo_queue(msk); 82362306a36Sopenharmony_ci if (unlikely(ssk->sk_err)) { 82462306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) 82562306a36Sopenharmony_ci __mptcp_error_report(sk); 82662306a36Sopenharmony_ci else 82762306a36Sopenharmony_ci __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); 82862306a36Sopenharmony_ci } 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci /* If the moves have caught up with the DATA_FIN sequence number 83162306a36Sopenharmony_ci * it's time to ack the DATA_FIN and change socket state, but 83262306a36Sopenharmony_ci * this is not a good place to change state. Let the workqueue 83362306a36Sopenharmony_ci * do it. 
83462306a36Sopenharmony_ci */ 83562306a36Sopenharmony_ci if (mptcp_pending_data_fin(sk, NULL)) 83662306a36Sopenharmony_ci mptcp_schedule_work(sk); 83762306a36Sopenharmony_ci return moved > 0; 83862306a36Sopenharmony_ci} 83962306a36Sopenharmony_ci 84062306a36Sopenharmony_civoid mptcp_data_ready(struct sock *sk, struct sock *ssk) 84162306a36Sopenharmony_ci{ 84262306a36Sopenharmony_ci struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 84362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 84462306a36Sopenharmony_ci int sk_rbuf, ssk_rbuf; 84562306a36Sopenharmony_ci 84662306a36Sopenharmony_ci /* The peer can send data while we are shutting down this 84762306a36Sopenharmony_ci * subflow at msk destruction time, but we must avoid enqueuing 84862306a36Sopenharmony_ci * more data to the msk receive queue 84962306a36Sopenharmony_ci */ 85062306a36Sopenharmony_ci if (unlikely(subflow->disposable)) 85162306a36Sopenharmony_ci return; 85262306a36Sopenharmony_ci 85362306a36Sopenharmony_ci ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 85462306a36Sopenharmony_ci sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 85562306a36Sopenharmony_ci if (unlikely(ssk_rbuf > sk_rbuf)) 85662306a36Sopenharmony_ci sk_rbuf = ssk_rbuf; 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci /* over limit? 
can't append more skbs to msk, Also, no need to wake-up*/ 85962306a36Sopenharmony_ci if (__mptcp_rmem(sk) > sk_rbuf) { 86062306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 86162306a36Sopenharmony_ci return; 86262306a36Sopenharmony_ci } 86362306a36Sopenharmony_ci 86462306a36Sopenharmony_ci /* Wake-up the reader only for in-sequence data */ 86562306a36Sopenharmony_ci mptcp_data_lock(sk); 86662306a36Sopenharmony_ci if (move_skbs_to_msk(msk, ssk)) 86762306a36Sopenharmony_ci sk->sk_data_ready(sk); 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci mptcp_data_unlock(sk); 87062306a36Sopenharmony_ci} 87162306a36Sopenharmony_ci 87262306a36Sopenharmony_cistatic void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) 87362306a36Sopenharmony_ci{ 87462306a36Sopenharmony_ci mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); 87562306a36Sopenharmony_ci WRITE_ONCE(msk->allow_infinite_fallback, false); 87662306a36Sopenharmony_ci mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); 87762306a36Sopenharmony_ci} 87862306a36Sopenharmony_ci 87962306a36Sopenharmony_cistatic bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) 88062306a36Sopenharmony_ci{ 88162306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 88262306a36Sopenharmony_ci 88362306a36Sopenharmony_ci if (sk->sk_state != TCP_ESTABLISHED) 88462306a36Sopenharmony_ci return false; 88562306a36Sopenharmony_ci 88662306a36Sopenharmony_ci /* attach to msk socket only after we are sure we will deal with it 88762306a36Sopenharmony_ci * at close time 88862306a36Sopenharmony_ci */ 88962306a36Sopenharmony_ci if (sk->sk_socket && !ssk->sk_socket) 89062306a36Sopenharmony_ci mptcp_sock_graft(ssk, sk->sk_socket); 89162306a36Sopenharmony_ci 89262306a36Sopenharmony_ci mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; 89362306a36Sopenharmony_ci mptcp_sockopt_sync_locked(msk, ssk); 89462306a36Sopenharmony_ci mptcp_subflow_joined(msk, ssk); 
89562306a36Sopenharmony_ci mptcp_stop_tout_timer(sk); 89662306a36Sopenharmony_ci __mptcp_propagate_sndbuf(sk, ssk); 89762306a36Sopenharmony_ci return true; 89862306a36Sopenharmony_ci} 89962306a36Sopenharmony_ci 90062306a36Sopenharmony_cistatic void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) 90162306a36Sopenharmony_ci{ 90262306a36Sopenharmony_ci struct mptcp_subflow_context *tmp, *subflow; 90362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 90462306a36Sopenharmony_ci 90562306a36Sopenharmony_ci list_for_each_entry_safe(subflow, tmp, join_list, node) { 90662306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 90762306a36Sopenharmony_ci bool slow = lock_sock_fast(ssk); 90862306a36Sopenharmony_ci 90962306a36Sopenharmony_ci list_move_tail(&subflow->node, &msk->conn_list); 91062306a36Sopenharmony_ci if (!__mptcp_finish_join(msk, ssk)) 91162306a36Sopenharmony_ci mptcp_subflow_reset(ssk); 91262306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 91362306a36Sopenharmony_ci } 91462306a36Sopenharmony_ci} 91562306a36Sopenharmony_ci 91662306a36Sopenharmony_cistatic bool mptcp_rtx_timer_pending(struct sock *sk) 91762306a36Sopenharmony_ci{ 91862306a36Sopenharmony_ci return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); 91962306a36Sopenharmony_ci} 92062306a36Sopenharmony_ci 92162306a36Sopenharmony_cistatic void mptcp_reset_rtx_timer(struct sock *sk) 92262306a36Sopenharmony_ci{ 92362306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 92462306a36Sopenharmony_ci unsigned long tout; 92562306a36Sopenharmony_ci 92662306a36Sopenharmony_ci /* prevent rescheduling on close */ 92762306a36Sopenharmony_ci if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) 92862306a36Sopenharmony_ci return; 92962306a36Sopenharmony_ci 93062306a36Sopenharmony_ci tout = mptcp_sk(sk)->timer_ival; 93162306a36Sopenharmony_ci sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); 93262306a36Sopenharmony_ci} 
93362306a36Sopenharmony_ci 93462306a36Sopenharmony_cibool mptcp_schedule_work(struct sock *sk) 93562306a36Sopenharmony_ci{ 93662306a36Sopenharmony_ci if (inet_sk_state_load(sk) != TCP_CLOSE && 93762306a36Sopenharmony_ci schedule_work(&mptcp_sk(sk)->work)) { 93862306a36Sopenharmony_ci /* each subflow already holds a reference to the sk, and the 93962306a36Sopenharmony_ci * workqueue is invoked by a subflow, so sk can't go away here. 94062306a36Sopenharmony_ci */ 94162306a36Sopenharmony_ci sock_hold(sk); 94262306a36Sopenharmony_ci return true; 94362306a36Sopenharmony_ci } 94462306a36Sopenharmony_ci return false; 94562306a36Sopenharmony_ci} 94662306a36Sopenharmony_ci 94762306a36Sopenharmony_cistatic struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) 94862306a36Sopenharmony_ci{ 94962306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 95062306a36Sopenharmony_ci 95162306a36Sopenharmony_ci msk_owned_by_me(msk); 95262306a36Sopenharmony_ci 95362306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 95462306a36Sopenharmony_ci if (READ_ONCE(subflow->data_avail)) 95562306a36Sopenharmony_ci return mptcp_subflow_tcp_sock(subflow); 95662306a36Sopenharmony_ci } 95762306a36Sopenharmony_ci 95862306a36Sopenharmony_ci return NULL; 95962306a36Sopenharmony_ci} 96062306a36Sopenharmony_ci 96162306a36Sopenharmony_cistatic bool mptcp_skb_can_collapse_to(u64 write_seq, 96262306a36Sopenharmony_ci const struct sk_buff *skb, 96362306a36Sopenharmony_ci const struct mptcp_ext *mpext) 96462306a36Sopenharmony_ci{ 96562306a36Sopenharmony_ci if (!tcp_skb_can_collapse_to(skb)) 96662306a36Sopenharmony_ci return false; 96762306a36Sopenharmony_ci 96862306a36Sopenharmony_ci /* can collapse only if MPTCP level sequence is in order and this 96962306a36Sopenharmony_ci * mapping has not been xmitted yet 97062306a36Sopenharmony_ci */ 97162306a36Sopenharmony_ci return mpext && mpext->data_seq + mpext->data_len == write_seq && 97262306a36Sopenharmony_ci !mpext->frozen; 
97362306a36Sopenharmony_ci} 97462306a36Sopenharmony_ci 97562306a36Sopenharmony_ci/* we can append data to the given data frag if: 97662306a36Sopenharmony_ci * - there is space available in the backing page_frag 97762306a36Sopenharmony_ci * - the data frag tail matches the current page_frag free offset 97862306a36Sopenharmony_ci * - the data frag end sequence number matches the current write seq 97962306a36Sopenharmony_ci */ 98062306a36Sopenharmony_cistatic bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, 98162306a36Sopenharmony_ci const struct page_frag *pfrag, 98262306a36Sopenharmony_ci const struct mptcp_data_frag *df) 98362306a36Sopenharmony_ci{ 98462306a36Sopenharmony_ci return df && pfrag->page == df->page && 98562306a36Sopenharmony_ci pfrag->size - pfrag->offset > 0 && 98662306a36Sopenharmony_ci pfrag->offset == (df->offset + df->data_len) && 98762306a36Sopenharmony_ci df->data_seq + df->data_len == msk->write_seq; 98862306a36Sopenharmony_ci} 98962306a36Sopenharmony_ci 99062306a36Sopenharmony_cistatic void dfrag_uncharge(struct sock *sk, int len) 99162306a36Sopenharmony_ci{ 99262306a36Sopenharmony_ci sk_mem_uncharge(sk, len); 99362306a36Sopenharmony_ci sk_wmem_queued_add(sk, -len); 99462306a36Sopenharmony_ci} 99562306a36Sopenharmony_ci 99662306a36Sopenharmony_cistatic void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) 99762306a36Sopenharmony_ci{ 99862306a36Sopenharmony_ci int len = dfrag->data_len + dfrag->overhead; 99962306a36Sopenharmony_ci 100062306a36Sopenharmony_ci list_del(&dfrag->list); 100162306a36Sopenharmony_ci dfrag_uncharge(sk, len); 100262306a36Sopenharmony_ci put_page(dfrag->page); 100362306a36Sopenharmony_ci} 100462306a36Sopenharmony_ci 100562306a36Sopenharmony_cistatic void __mptcp_clean_una(struct sock *sk) 100662306a36Sopenharmony_ci{ 100762306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 100862306a36Sopenharmony_ci struct mptcp_data_frag *dtmp, *dfrag; 100962306a36Sopenharmony_ci u64 snd_una; 
101062306a36Sopenharmony_ci 101162306a36Sopenharmony_ci snd_una = msk->snd_una; 101262306a36Sopenharmony_ci list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { 101362306a36Sopenharmony_ci if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) 101462306a36Sopenharmony_ci break; 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci if (unlikely(dfrag == msk->first_pending)) { 101762306a36Sopenharmony_ci /* in recovery mode can see ack after the current snd head */ 101862306a36Sopenharmony_ci if (WARN_ON_ONCE(!msk->recovery)) 101962306a36Sopenharmony_ci break; 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ci WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 102262306a36Sopenharmony_ci } 102362306a36Sopenharmony_ci 102462306a36Sopenharmony_ci dfrag_clear(sk, dfrag); 102562306a36Sopenharmony_ci } 102662306a36Sopenharmony_ci 102762306a36Sopenharmony_ci dfrag = mptcp_rtx_head(sk); 102862306a36Sopenharmony_ci if (dfrag && after64(snd_una, dfrag->data_seq)) { 102962306a36Sopenharmony_ci u64 delta = snd_una - dfrag->data_seq; 103062306a36Sopenharmony_ci 103162306a36Sopenharmony_ci /* prevent wrap around in recovery mode */ 103262306a36Sopenharmony_ci if (unlikely(delta > dfrag->already_sent)) { 103362306a36Sopenharmony_ci if (WARN_ON_ONCE(!msk->recovery)) 103462306a36Sopenharmony_ci goto out; 103562306a36Sopenharmony_ci if (WARN_ON_ONCE(delta > dfrag->data_len)) 103662306a36Sopenharmony_ci goto out; 103762306a36Sopenharmony_ci dfrag->already_sent += delta - dfrag->already_sent; 103862306a36Sopenharmony_ci } 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci dfrag->data_seq += delta; 104162306a36Sopenharmony_ci dfrag->offset += delta; 104262306a36Sopenharmony_ci dfrag->data_len -= delta; 104362306a36Sopenharmony_ci dfrag->already_sent -= delta; 104462306a36Sopenharmony_ci 104562306a36Sopenharmony_ci dfrag_uncharge(sk, delta); 104662306a36Sopenharmony_ci } 104762306a36Sopenharmony_ci 104862306a36Sopenharmony_ci /* all retransmitted data acked, 
recovery completed */ 104962306a36Sopenharmony_ci if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) 105062306a36Sopenharmony_ci msk->recovery = false; 105162306a36Sopenharmony_ci 105262306a36Sopenharmony_ciout: 105362306a36Sopenharmony_ci if (snd_una == READ_ONCE(msk->snd_nxt) && 105462306a36Sopenharmony_ci snd_una == READ_ONCE(msk->write_seq)) { 105562306a36Sopenharmony_ci if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) 105662306a36Sopenharmony_ci mptcp_stop_rtx_timer(sk); 105762306a36Sopenharmony_ci } else { 105862306a36Sopenharmony_ci mptcp_reset_rtx_timer(sk); 105962306a36Sopenharmony_ci } 106062306a36Sopenharmony_ci} 106162306a36Sopenharmony_ci 106262306a36Sopenharmony_cistatic void __mptcp_clean_una_wakeup(struct sock *sk) 106362306a36Sopenharmony_ci{ 106462306a36Sopenharmony_ci lockdep_assert_held_once(&sk->sk_lock.slock); 106562306a36Sopenharmony_ci 106662306a36Sopenharmony_ci __mptcp_clean_una(sk); 106762306a36Sopenharmony_ci mptcp_write_space(sk); 106862306a36Sopenharmony_ci} 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_cistatic void mptcp_clean_una_wakeup(struct sock *sk) 107162306a36Sopenharmony_ci{ 107262306a36Sopenharmony_ci mptcp_data_lock(sk); 107362306a36Sopenharmony_ci __mptcp_clean_una_wakeup(sk); 107462306a36Sopenharmony_ci mptcp_data_unlock(sk); 107562306a36Sopenharmony_ci} 107662306a36Sopenharmony_ci 107762306a36Sopenharmony_cistatic void mptcp_enter_memory_pressure(struct sock *sk) 107862306a36Sopenharmony_ci{ 107962306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 108062306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 108162306a36Sopenharmony_ci bool first = true; 108262306a36Sopenharmony_ci 108362306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 108462306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 108562306a36Sopenharmony_ci 108662306a36Sopenharmony_ci if (first) 108762306a36Sopenharmony_ci tcp_enter_memory_pressure(ssk); 
108862306a36Sopenharmony_ci sk_stream_moderate_sndbuf(ssk); 108962306a36Sopenharmony_ci 109062306a36Sopenharmony_ci first = false; 109162306a36Sopenharmony_ci } 109262306a36Sopenharmony_ci __mptcp_sync_sndbuf(sk); 109362306a36Sopenharmony_ci} 109462306a36Sopenharmony_ci 109562306a36Sopenharmony_ci/* ensure we get enough memory for the frag hdr, beyond some minimal amount of 109662306a36Sopenharmony_ci * data 109762306a36Sopenharmony_ci */ 109862306a36Sopenharmony_cistatic bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 109962306a36Sopenharmony_ci{ 110062306a36Sopenharmony_ci if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), 110162306a36Sopenharmony_ci pfrag, sk->sk_allocation))) 110262306a36Sopenharmony_ci return true; 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ci mptcp_enter_memory_pressure(sk); 110562306a36Sopenharmony_ci return false; 110662306a36Sopenharmony_ci} 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_cistatic struct mptcp_data_frag * 110962306a36Sopenharmony_cimptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, 111062306a36Sopenharmony_ci int orig_offset) 111162306a36Sopenharmony_ci{ 111262306a36Sopenharmony_ci int offset = ALIGN(orig_offset, sizeof(long)); 111362306a36Sopenharmony_ci struct mptcp_data_frag *dfrag; 111462306a36Sopenharmony_ci 111562306a36Sopenharmony_ci dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); 111662306a36Sopenharmony_ci dfrag->data_len = 0; 111762306a36Sopenharmony_ci dfrag->data_seq = msk->write_seq; 111862306a36Sopenharmony_ci dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); 111962306a36Sopenharmony_ci dfrag->offset = offset + sizeof(struct mptcp_data_frag); 112062306a36Sopenharmony_ci dfrag->already_sent = 0; 112162306a36Sopenharmony_ci dfrag->page = pfrag->page; 112262306a36Sopenharmony_ci 112362306a36Sopenharmony_ci return dfrag; 112462306a36Sopenharmony_ci} 112562306a36Sopenharmony_ci 
112662306a36Sopenharmony_cistruct mptcp_sendmsg_info { 112762306a36Sopenharmony_ci int mss_now; 112862306a36Sopenharmony_ci int size_goal; 112962306a36Sopenharmony_ci u16 limit; 113062306a36Sopenharmony_ci u16 sent; 113162306a36Sopenharmony_ci unsigned int flags; 113262306a36Sopenharmony_ci bool data_lock_held; 113362306a36Sopenharmony_ci}; 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_cistatic int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, 113662306a36Sopenharmony_ci u64 data_seq, int avail_size) 113762306a36Sopenharmony_ci{ 113862306a36Sopenharmony_ci u64 window_end = mptcp_wnd_end(msk); 113962306a36Sopenharmony_ci u64 mptcp_snd_wnd; 114062306a36Sopenharmony_ci 114162306a36Sopenharmony_ci if (__mptcp_check_fallback(msk)) 114262306a36Sopenharmony_ci return avail_size; 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci mptcp_snd_wnd = window_end - data_seq; 114562306a36Sopenharmony_ci avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { 114862306a36Sopenharmony_ci tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); 114962306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); 115062306a36Sopenharmony_ci } 115162306a36Sopenharmony_ci 115262306a36Sopenharmony_ci return avail_size; 115362306a36Sopenharmony_ci} 115462306a36Sopenharmony_ci 115562306a36Sopenharmony_cistatic bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) 115662306a36Sopenharmony_ci{ 115762306a36Sopenharmony_ci struct skb_ext *mpext = __skb_ext_alloc(gfp); 115862306a36Sopenharmony_ci 115962306a36Sopenharmony_ci if (!mpext) 116062306a36Sopenharmony_ci return false; 116162306a36Sopenharmony_ci __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); 116262306a36Sopenharmony_ci return true; 116362306a36Sopenharmony_ci} 116462306a36Sopenharmony_ci 116562306a36Sopenharmony_cistatic struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock 
*sk, gfp_t gfp) 116662306a36Sopenharmony_ci{ 116762306a36Sopenharmony_ci struct sk_buff *skb; 116862306a36Sopenharmony_ci 116962306a36Sopenharmony_ci skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); 117062306a36Sopenharmony_ci if (likely(skb)) { 117162306a36Sopenharmony_ci if (likely(__mptcp_add_ext(skb, gfp))) { 117262306a36Sopenharmony_ci skb_reserve(skb, MAX_TCP_HEADER); 117362306a36Sopenharmony_ci skb->ip_summed = CHECKSUM_PARTIAL; 117462306a36Sopenharmony_ci INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 117562306a36Sopenharmony_ci return skb; 117662306a36Sopenharmony_ci } 117762306a36Sopenharmony_ci __kfree_skb(skb); 117862306a36Sopenharmony_ci } else { 117962306a36Sopenharmony_ci mptcp_enter_memory_pressure(sk); 118062306a36Sopenharmony_ci } 118162306a36Sopenharmony_ci return NULL; 118262306a36Sopenharmony_ci} 118362306a36Sopenharmony_ci 118462306a36Sopenharmony_cistatic struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 118562306a36Sopenharmony_ci{ 118662306a36Sopenharmony_ci struct sk_buff *skb; 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci skb = __mptcp_do_alloc_tx_skb(sk, gfp); 118962306a36Sopenharmony_ci if (!skb) 119062306a36Sopenharmony_ci return NULL; 119162306a36Sopenharmony_ci 119262306a36Sopenharmony_ci if (likely(sk_wmem_schedule(ssk, skb->truesize))) { 119362306a36Sopenharmony_ci tcp_skb_entail(ssk, skb); 119462306a36Sopenharmony_ci return skb; 119562306a36Sopenharmony_ci } 119662306a36Sopenharmony_ci tcp_skb_tsorted_anchor_cleanup(skb); 119762306a36Sopenharmony_ci kfree_skb(skb); 119862306a36Sopenharmony_ci return NULL; 119962306a36Sopenharmony_ci} 120062306a36Sopenharmony_ci 120162306a36Sopenharmony_cistatic struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) 120262306a36Sopenharmony_ci{ 120362306a36Sopenharmony_ci gfp_t gfp = data_lock_held ? 
GFP_ATOMIC : sk->sk_allocation;

	return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}

/* Extend the data checksum stored in the skb MPTCP extension so that it
 * also covers the last @added bytes of @skb: only the range starting at
 * skb->len - @added is summed, and the result is folded into the value
 * already stored in mpext->csum.
 *
 * NOTE(review): the previous wording of this comment stated that the csum
 * is always recomputed on the whole skb; the skb_checksum() call below is
 * limited to @added bytes, so that statement looks stale - please confirm.
 */
static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
{
	struct mptcp_ext *mpext = mptcp_get_ext(skb);
	/* undo the final fold/complement to get back a running sum */
	__wsum csum = ~csum_unfold(mpext->csum);
	int offset = skb->len - added;

	mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
}

/* Turn the mapping described by @mpext into an infinite one (infinite_map
 * set, data_len zeroed), account the event in the INFINITEMAPTX MIB, clear
 * the pending send_infinite_map request on the subflow and fall back.
 * Nothing is done when @mpext is NULL.
 */
static void mptcp_update_infinite_map(struct mptcp_sock *msk,
				      struct sock *ssk,
				      struct mptcp_ext *mpext)
{
	if (!mpext)
		return;

	mpext->infinite_map = 1;
	mpext->data_len = 0;

	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
	mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
	pr_fallback(msk);
	mptcp_do_fallback(ssk);
}

/* upper bound enforced on ssk->sk_gso_max_size by mptcp_sendmsg_frag() */
#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))

/* Queue (part of) @dfrag on the @ssk write queue.
 *
 * @sk:    the MPTCP socket owning @dfrag
 * @ssk:   the subflow socket the data is queued on
 * @dfrag: the data fragment being transmitted
 * @info:  per-push state; ->sent is the amount of @dfrag already handed
 *         out, ->limit the upper bound inside @dfrag, ->flags is passed to
 *         tcp_send_mss() and ->data_lock_held to the skb allocator;
 *         ->mss_now and ->size_goal are updated here.
 *
 * The data is either appended to the tail skb of @ssk - when the existing
 * mapping can be extended - or placed in a newly allocated skb carrying a
 * fresh mapping.
 *
 * Returns the number of bytes queued, 0 when nothing was queued (invalid
 * @info bounds, mptcp_check_allowed_size() returned 0, or a zero window
 * probe was sent instead), -EAGAIN when __tcp_can_send() rejects @ssk,
 * -ENOMEM when no skb or no write memory could be obtained.
 */
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct mptcp_data_frag *dfrag,
			      struct mptcp_sendmsg_info *info)
{
	u64 data_seq = dfrag->data_seq + info->sent;
	int offset = dfrag->offset + info->sent;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool zero_window_probe = false;
	struct mptcp_ext *mpext = NULL;
	bool can_coalesce = false;
	bool reuse_skb = true;
	struct sk_buff *skb;
	size_t copy;
	int i;

	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

	/* callers must keep sent <= limit <= dfrag->data_len */
	if (WARN_ON_ONCE(info->sent > info->limit ||
			 info->limit > dfrag->data_len))
		return 0;

	if (unlikely(!__tcp_can_send(ssk)))
		return -EAGAIN;

	/* compute send limit */
	if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE))
		ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE;
	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
	copy = info->size_goal;

	skb = tcp_write_queue_tail(ssk);
	if (skb && copy > skb->len) {
		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most a new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
		if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
			TCP_SKB_CB(skb)->eor = 1;
			tcp_mark_push(tcp_sk(ssk), skb);
			goto alloc_skb;
		}

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
		/* a new frag slot would be needed, but none is left */
		if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
			tcp_mark_push(tcp_sk(ssk), skb);
			goto alloc_skb;
		}

		copy -= skb->len;
	} else {
alloc_skb:
		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
		if (!skb)
			return -ENOMEM;

		i = skb_shinfo(skb)->nr_frags;
		reuse_skb = false;
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
	}

	/* Zero window and all data acked? Probe. */
	copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
	if (copy == 0) {
		u64 snd_una = READ_ONCE(msk->snd_una);

		/* probe only when nothing is in flight at the MPTCP level
		 * and the subflow write queue is empty
		 */
		if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
			tcp_remove_empty_skb(ssk);
			return 0;
		}

		/* the probe carries the single byte preceding snd_una */
		zero_window_probe = true;
		data_seq = snd_una - 1;
		copy = 1;
	}

	copy = min_t(size_t, copy, info->limit - info->sent);
	if (!sk_wmem_schedule(ssk, copy)) {
		tcp_remove_empty_skb(ssk);
		return -ENOMEM;
	}

	if (can_coalesce) {
		/* grow the last frag in place */
		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
	} else {
		/* the new frag slot holds its own page reference */
		get_page(dfrag->page);
		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
	}

	/* account the new payload on both the skb and the subflow */
	skb->len += copy;
	skb->data_len += copy;
	skb->truesize += copy;
	sk_wmem_queued_add(ssk, copy);
	sk_mem_charge(ssk, copy);
	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
	TCP_SKB_CB(skb)->end_seq += copy;
	tcp_skb_pcount_set(skb, 0);

	/* on skb reuse we just need to update the DSS len */
	if (reuse_skb) {
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
		mpext->data_len += copy;
		goto out;
	}

	/* new skb: build a fresh mapping with a 64 bit data sequence number */
	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = data_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = copy;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

	if (zero_window_probe) {
		/* the probe is pushed right away and reported to the caller
		 * as "no data queued"
		 */
		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
		mpext->frozen = 1;
		if (READ_ONCE(msk->csum_enabled))
			mptcp_update_data_checksum(skb, copy);
		tcp_push_pending_frames(ssk);
		return 0;
	}
out:
	if (READ_ONCE(msk->csum_enabled))
		mptcp_update_data_checksum(skb, copy);
	if (mptcp_subflow_ctx(ssk)->send_infinite_map)
		mptcp_update_infinite_map(msk, ssk, mpext);
	trace_mptcp_sendmsg_frag(mpext);
	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
	return copy;
}

/* upper bound for the per-scheduling-round burst, see
 * mptcp_subflow_get_send()
 */
#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
					 sizeof(struct tcphdr) - \
					 MAX_TCP_OPTION_SPACE - \
					 sizeof(struct ipv6hdr) - \
					 sizeof(struct frag_hdr))

/* best candidate found so far by mptcp_subflow_get_send() for a given
 * SSK_MODE_*: the subflow socket and its estimated linger time
 */
struct subflow_send_info {
	struct sock *ssk;
	u64 linger_time;
};

/* Clear the stale mark on @subflow, if set, and account the recovery in
 * the SUBFLOWRECOVER MIB.
 */
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
{
	if (!subflow->stale)
		return;

	subflow->stale = 0;
	MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
}

/* Return true if @subflow can be used for transmission.
 *
 * A stale subflow is reported as not active for as long as the rcv_tstamp
 * of its TCP socket still equals the value saved in stale_rcv_tstamp; once
 * the two differ the stale mark is dropped and the usual
 * __mptcp_subflow_active() check applies.
 */
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	if (unlikely(subflow->stale)) {
		u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);

		if (subflow->stale_rcv_tstamp == rcv_tstamp)
			return false;

		mptcp_subflow_set_active(subflow);
	}
	return __mptcp_subflow_active(subflow);
}

/* indexes into the send_info[] array of mptcp_subflow_get_send(); the
 * array is indexed with subflow->backup
 */
#define SSK_MODE_ACTIVE	0
#define SSK_MODE_BACKUP	1
#define SSK_MODE_MAX	2

/* implement the mptcp packet scheduler;
 * returns the subflow that will transmit the next DSS
 * additionally updates the rtx timeout
 *
 * Returns NULL when no suitable subflow is found or when the selected one
 * has no stream memory available.
 */
struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct subflow_send_info send_info[SSK_MODE_MAX];
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 pace, burst, wmem;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 linger_time;
	long tout = 0;

	/* for each mode (active/backup) pick the subflow with the lowest
	 * estimated linger time, i.e. queued wmem over average pacing rate;
	 * linger_time starts from the maximum u64 value
	 */
	for (i = 0; i < SSK_MODE_MAX; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].linger_time = -1;
	}

	mptcp_for_each_subflow(msk, subflow) {
		trace_mptcp_subflow_get_send(subflow);
		ssk = mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		tout = max(tout, mptcp_timeout_from_subflow(subflow));
		nr_active += !subflow->backup;
		pace = subflow->avg_pacing_rate;
		if (unlikely(!pace)) {
			/* init pacing rate from socket */
			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
			pace = subflow->avg_pacing_rate;
			if (!pace)
				continue;
		}

		/* fixed point (<< 32) to keep precision in the division */
		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
		if (linger_time < send_info[subflow->backup].linger_time) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].linger_time = linger_time;
		}
	}
	__mptcp_set_timeout(sk, tout);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;

	/* According to the blest algorithm, to avoid HoL blocking for the
	 * faster flow, we need to:
	 * - estimate the faster flow linger time
	 * - use the above to estimate the amount of bytes transferred
	 *   by the faster flow
	 * - check that the amount of queued data is greater than the above,
	 *   otherwise do not use the picked, slower, subflow
	 * We select the subflow with the shorter estimated time to flush
	 * the queued mem, which basically ensures the above. We just need
	 * to check that subflow has a non empty cwin.
	 */
	ssk = send_info[SSK_MODE_ACTIVE].ssk;
	if (!ssk || !sk_stream_memory_free(ssk))
		return NULL;

	burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
	wmem = READ_ONCE(ssk->sk_wmem_queued);
	if (!burst)
		return ssk;

	/* new average: old average weighted by the queued wmem, current
	 * socket pacing rate weighted by the burst about to be sent
	 */
	subflow = mptcp_subflow_ctx(ssk);
	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
					   READ_ONCE(ssk->sk_pacing_rate) * burst,
					   burst + wmem);
	msk->snd_burst = burst;
	return ssk;
}

/* Push the data queued on @ssk, using the mss/size goal cached in @info,
 * then release the subflow socket lock.
 */
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
{
	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
	release_sock(ssk);
}

/* Update the msk-level state after @sent bytes of @dfrag have been handed
 * to a subflow: dfrag->already_sent, the remaining send burst and, when
 * moving forward, snd_nxt together with the bytes_sent counter.
 */
static void mptcp_update_post_push(struct mptcp_sock *msk,
				   struct mptcp_data_frag *dfrag,
				   u32 sent)
{
	u64 snd_nxt_new = dfrag->data_seq;

	dfrag->already_sent += sent;

	msk->snd_burst -= sent;

	snd_nxt_new += dfrag->already_sent;

	/* snd_nxt_new can be smaller than snd_nxt in case mptcp
	 * is recovering after a failover. In that event, this re-sends
	 * old segments.
	 *
	 * Thus compute snd_nxt_new candidate based on
	 * the dfrag->data_seq that was sent and the data
	 * that has been handed to the subflow for transmission
	 * and skip update in case it was old dfrag.
	 */
	if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
		msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
		msk->snd_nxt = snd_nxt_new;
	}
}

/* If there is still data to be sent, set the MPTCP_PUSH_PENDING bit in
 * cb_flags; the update is done under the msk data lock.
 */
void mptcp_check_and_set_pending(struct sock *sk)
{
	if (mptcp_send_head(sk)) {
		mptcp_data_lock(sk);
		mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING);
		mptcp_data_unlock(sk);
	}
}

/* Push the pending data fragments of @sk on @ssk, starting from the send
 * head, until mptcp_sendmsg_frag() stops accepting data, the send burst is
 * exhausted, @ssk has no stream memory left or is no longer active.
 *
 * Returns the number of bytes queued on @ssk if any; otherwise the last
 * mptcp_sendmsg_frag() return value (0 or a negative error), or 0 when
 * there was nothing to send.
 */
static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
				  struct mptcp_sendmsg_info *info)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dfrag;
	int len, copied = 0, err = 0;

	while ((dfrag = mptcp_send_head(sk))) {
		info->sent = dfrag->already_sent;
		info->limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
			if (ret <= 0) {
				/* report partial progress, if any, instead
				 * of the error
				 */
				err = copied ? : ret;
				goto out;
			}

			info->sent += ret;
			copied += ret;
			len -= ret;

			mptcp_update_post_push(msk, dfrag, ret);
		}
		/* dfrag fully sent: move the send head to the next one */
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));

		if (msk->snd_burst <= 0 ||
		    !sk_stream_memory_free(ssk) ||
		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
			err = copied;
			goto out;
		}
		mptcp_set_timeout(sk);
	}
	err = copied;

out:
	return err;
}

/* Push the pending data of @sk on the subflows selected by the packet
 * scheduler; @flags is forwarded to mptcp_sendmsg_frag() via the sendmsg
 * info. The subflow socket lock is taken here and kept across consecutive
 * rounds using the same subflow.
 */
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
	struct sock *prev_ssk = NULL, *ssk = NULL;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info = {
				.flags = flags,
	};
	bool do_check_data_fin = false;
	int push_count = 1;

	/* keep going while the last round used at least one subflow */
	while (mptcp_send_head(sk) && (push_count > 0)) {
		struct mptcp_subflow_context *subflow;
		int ret = 0;

		if (mptcp_sched_get_send(msk))
			break;

		/* counts the subflows worth another scheduling round */
		push_count = 0;

		mptcp_for_each_subflow(msk, subflow) {
			if (READ_ONCE(subflow->scheduled)) {
				/* consume the scheduler decision */
				mptcp_subflow_set_scheduled(subflow, false);

				prev_ssk = ssk;
				ssk = mptcp_subflow_tcp_sock(subflow);
				if (ssk != prev_ssk) {
					/* First check. If the ssk has changed since
					 * the last round, release prev_ssk
					 */
					if (prev_ssk)
						mptcp_push_release(prev_ssk, &info);

					/* Need to lock the new subflow only if different
					 * from the previous one, otherwise we are still
					 * holding the relevant lock
					 */
					lock_sock(ssk);
				}

				push_count++;

				ret = __subflow_push_pending(sk, ssk, &info);
				if (ret <= 0) {
					/* nothing was sent: the subflow still
					 * counts for another round only on
					 * -EAGAIN and when it is not in one of
					 * the FIN_WAIT1/FIN_WAIT2/CLOSE states
					 */
					if (ret != -EAGAIN ||
					    (1 << ssk->sk_state) &
					     (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
						push_count--;
					continue;
				}
				do_check_data_fin = true;
			}
		}
	}

	/* at this point we hold the socket lock for the last subflow we used */
	if (ssk)
		mptcp_push_release(ssk, &info);

	/* ensure the rtx timer is running */
	if (!mptcp_rtx_timer_pending(sk))
		mptcp_reset_rtx_timer(sk);
	if (do_check_data_fin)
		mptcp_check_send_data_fin(sk);
}

/* Push the pending data of @sk on @ssk without taking or releasing the
 * subflow socket lock in here; skbs are allocated with
 * info.data_lock_held set.
 *
 * @first: when true, the first chunk is pushed on @ssk without consulting
 *         the packet scheduler; later rounds ask the scheduler and
 *         delegate the transmission to any other subflow it selects.
 */
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info = {
		.data_lock_held = true,
	};
	bool keep_pushing = true;
	struct sock *xmit_ssk;
	int copied = 0;

	info.flags = 0;
	while (mptcp_send_head(sk) && keep_pushing) {
		struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
		int ret = 0;

		/* check for a different subflow usage only after
		 * spooling the first chunk of data
		 */
		if (first) {
			mptcp_subflow_set_scheduled(subflow, false);
			ret = __subflow_push_pending(sk, ssk, &info);
			first = false;
			if (ret <= 0)
				break;
			copied += ret;
			continue;
		}

		if (mptcp_sched_get_send(msk))
			goto out;

		if (READ_ONCE(subflow->scheduled)) {
			mptcp_subflow_set_scheduled(subflow, false);
			ret = __subflow_push_pending(sk, ssk, &info);
			if (ret <= 0)
				keep_pushing = false;
			/* NOTE(review): ret is added even when it is a
			 * negative error code, so copied can end up non-zero
			 * (or reduced) without a matching amount of data
			 * being queued - confirm this is intended.
			 */
			copied += ret;
		}

		/* hand over to any other subflow picked by the scheduler */
		mptcp_for_each_subflow(msk, subflow) {
			if (READ_ONCE(subflow->scheduled)) {
				xmit_ssk = mptcp_subflow_tcp_sock(subflow);
				if (xmit_ssk != ssk) {
					mptcp_subflow_delegate(subflow,
							       MPTCP_DELEGATE_SEND);
					keep_pushing = false;
				}
			}
		}
	}

out:
	/* __mptcp_alloc_tx_skb could have released some wmem and we are
	 * not going to flush it via release_sock()
	 */
	if (copied) {
		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
			 info.size_goal);
		if (!mptcp_rtx_timer_pending(sk))
			mptcp_reset_rtx_timer(sk);

		/* only the DATA_FIN is left to be sent: defer to the worker */
		if (msk->snd_data_fin_enable &&
		    msk->snd_nxt + 1 == msk->write_seq)
			mptcp_schedule_work(sk);
	}
}

/* Flag that the sender ran out of space, both on the struct socket
 * (SOCK_NOSPACE) and on the msk (MPTCP_NOSPACE).
 */
static void mptcp_set_nospace(struct sock *sk)
{
	/* enable autotune */
	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

	/* will be cleared on avail space */
	set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}

/* defined later in this file, needed by the fastopen error path below */
static int mptcp_disconnect(struct sock *sk, int flags);

/* Fastopen/deferred-connect part of sendmsg(): send the data in the SYN
 * of the first subflow.
 *
 * @copied_syn: filled with the number of bytes accepted for the SYN; it is
 *              zeroed when the blocking connect fails with an error other
 *              than -EINPROGRESS, -ERESTARTSYS or -EINTR.
 *
 * Returns the tcp_sendmsg_fastopen() or __inet_stream_connect() result,
 * the __mptcp_nmpc_sk() error when the first subflow can't be created, or
 * -EINVAL when there is no first subflow. The DEFER_CONNECT bit is cleared
 * once the attempt has been made.
 */
static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				  size_t len, int *copied_syn)
{
	unsigned int saved_flags = msg->msg_flags;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;
	int ret;

	/* on flags based fastopen the mptcp is supposed to create the
	 * first subflow right now. Otherwise we are in the defer_connect
	 * path, and the first subflow must be already present.
	 * Since the defer_connect flag is cleared after the first successful
	 * fastopen attempt, no need to check for additional subflow status.
	 */
	if (msg->msg_flags & MSG_FASTOPEN) {
		ssk = __mptcp_nmpc_sk(msk);
		if (IS_ERR(ssk))
			return PTR_ERR(ssk);
	}
	if (!msk->first)
		return -EINVAL;

	ssk = msk->first;

	/* the subflow-level attempt is always non blocking; the caller's
	 * flags are restored right after it
	 */
	lock_sock(ssk);
	msg->msg_flags |= MSG_DONTWAIT;
	msk->fastopening = 1;
	ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
	msk->fastopening = 0;
	msg->msg_flags = saved_flags;
	release_sock(ssk);

	/* do the blocking bits of inet_stream_connect outside the ssk socket lock */
	if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
		ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
					    msg->msg_namelen, msg->msg_flags, 1);

		/* Keep the same behaviour of plain TCP: zero the copied bytes in
		 * case of any error, except timeout or signal
		 */
		if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
			*copied_syn = 0;
	} else if (ret && ret != -EINPROGRESS) {
		/* The disconnect() op called by tcp_sendmsg_fastopen()/
		 * __inet_stream_connect() can fail, due to locking check,
		 * see mptcp_disconnect().
		 * Attempt it again outside the problematic scope.
		 */
		if (!mptcp_disconnect(sk, 0))
			sk->sk_socket->state = SS_UNCONNECTED;
	}
	inet_clear_bit(DEFER_CONNECT, sk);

	return ret;
}

/* sendmsg() implementation for MPTCP sockets: copy the user data from @msg
 * into page frags queued on the msk rtx queue, then push the pending data
 * on the subflows.
 *
 * Returns the number of bytes queued when at least some data was accepted;
 * otherwise the sk_stream_error() result for the failure.
 *
 * NOTE(review): on the error path the sk_stream_error() result is stored
 * in the size_t 'copied' before being returned as int - confirm this
 * round-trip is intended.
 */
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	int ret = 0;
	long timeo;

	/* silently ignore everything else */
	msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;

	lock_sock(sk);

	if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
		     msg->msg_flags & MSG_FASTOPEN)) {
		int copied_syn = 0;

		ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
		copied += copied_syn;
		/* data went out with the SYN: report it right away */
		if (ret == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (ret)
			goto do_error;
	}

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	/* wait for the connection unless already ESTABLISHED/CLOSE_WAIT */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto do_error;
	}

	ret = -EPIPE;
	if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
		goto do_error;

	pfrag = sk_page_frag(sk);

	while (msg_data_left(msg)) {
		int total_ts, frag_truesize = 0;
		struct mptcp_data_frag *dfrag;
		bool dfrag_collapsed;
		size_t psize, offset;

		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_pending_tail(sk);
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			if (!sk_stream_memory_free(sk))
				goto wait_for_memory;

			if (!mptcp_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
			/* a new dfrag also pays for its own overhead */
			frag_truesize = dfrag->overhead;
		}

		/* we do not bound vs wspace, to allow a single packet.
		 * memory accounting will prevent excessive memory usage
		 * anyway
		 */
		offset = dfrag->offset + dfrag->data_len;
		psize = pfrag->size - offset;
		psize = min_t(size_t, psize, msg_data_left(msg));
		total_ts = psize + frag_truesize;

		if (!sk_wmem_schedule(sk, total_ts))
			goto wait_for_memory;

		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			ret = -EFAULT;
			goto do_error;
		}

		/* data successfully copied into the write queue */
		sk_forward_alloc_add(sk, -total_ts);
		copied += psize;
		dfrag->data_len += psize;
		frag_truesize += psize;
		pfrag->offset += frag_truesize;
		WRITE_ONCE(msk->write_seq, msk->write_seq + psize);

		/* charge data on mptcp pending queue to the msk socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk_wmem_queued_add(sk, frag_truesize);
		if (!dfrag_collapsed) {
			/* the queued dfrag holds its own page reference */
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			if (!msk->first_pending)
				WRITE_ONCE(msk->first_pending, dfrag);
		}
		pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
			 !dfrag_collapsed);

		continue;

wait_for_memory:
		/* flush what is already queued, then wait for memory */
		mptcp_set_nospace(sk);
		__mptcp_push_pending(sk, msg->msg_flags);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto do_error;
	}

	if (copied)
		__mptcp_push_pending(sk, msg->msg_flags);

out:
	release_sock(sk);
	return copied;

do_error:
	/* partial progress wins over the error */
	if (copied)
		goto out;

	copied = sk_stream_error(sk, msg->msg_flags, ret);
	goto out;
}

/* Copy up to @len bytes from the msk receive queue into @msg.
 *
 * @flags:      MSG_TRUNC skips the copy to @msg, MSG_PEEK leaves the
 *              queue untouched
 * @tss:        updated with the rx timestamps of the processed skbs
 * @cmsg_flags: MPTCP_CMSG_TS is set when a timestamp has been collected
 *
 * Returns the number of bytes processed, or the skb_copy_datagram_msg()
 * error when it fails before any byte has been copied.
 */
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len, int flags,
				struct scm_timestamping_internal *tss,
				int *cmsg_flags)
{
	struct sk_buff *skb, *tmp;
	int copied = 0;

	skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
		/* offset: bytes of this skb already consumed */
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;
191462306a36Sopenharmony_ci 191562306a36Sopenharmony_ci if (!(flags & MSG_TRUNC)) { 191662306a36Sopenharmony_ci err = skb_copy_datagram_msg(skb, offset, msg, count); 191762306a36Sopenharmony_ci if (unlikely(err < 0)) { 191862306a36Sopenharmony_ci if (!copied) 191962306a36Sopenharmony_ci return err; 192062306a36Sopenharmony_ci break; 192162306a36Sopenharmony_ci } 192262306a36Sopenharmony_ci } 192362306a36Sopenharmony_ci 192462306a36Sopenharmony_ci if (MPTCP_SKB_CB(skb)->has_rxtstamp) { 192562306a36Sopenharmony_ci tcp_update_recv_tstamps(skb, tss); 192662306a36Sopenharmony_ci *cmsg_flags |= MPTCP_CMSG_TS; 192762306a36Sopenharmony_ci } 192862306a36Sopenharmony_ci 192962306a36Sopenharmony_ci copied += count; 193062306a36Sopenharmony_ci 193162306a36Sopenharmony_ci if (count < data_len) { 193262306a36Sopenharmony_ci if (!(flags & MSG_PEEK)) { 193362306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->offset += count; 193462306a36Sopenharmony_ci MPTCP_SKB_CB(skb)->map_seq += count; 193562306a36Sopenharmony_ci } 193662306a36Sopenharmony_ci break; 193762306a36Sopenharmony_ci } 193862306a36Sopenharmony_ci 193962306a36Sopenharmony_ci if (!(flags & MSG_PEEK)) { 194062306a36Sopenharmony_ci /* we will bulk release the skb memory later */ 194162306a36Sopenharmony_ci skb->destructor = NULL; 194262306a36Sopenharmony_ci WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); 194362306a36Sopenharmony_ci __skb_unlink(skb, &msk->receive_queue); 194462306a36Sopenharmony_ci __kfree_skb(skb); 194562306a36Sopenharmony_ci } 194662306a36Sopenharmony_ci 194762306a36Sopenharmony_ci if (copied >= len) 194862306a36Sopenharmony_ci break; 194962306a36Sopenharmony_ci } 195062306a36Sopenharmony_ci 195162306a36Sopenharmony_ci return copied; 195262306a36Sopenharmony_ci} 195362306a36Sopenharmony_ci 195462306a36Sopenharmony_ci/* receive buffer autotuning. See tcp_rcv_space_adjust for more information. 
195562306a36Sopenharmony_ci * 195662306a36Sopenharmony_ci * Only difference: Use highest rtt estimate of the subflows in use. 195762306a36Sopenharmony_ci */ 195862306a36Sopenharmony_cistatic void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) 195962306a36Sopenharmony_ci{ 196062306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 196162306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 196262306a36Sopenharmony_ci u8 scaling_ratio = U8_MAX; 196362306a36Sopenharmony_ci u32 time, advmss = 1; 196462306a36Sopenharmony_ci u64 rtt_us, mstamp; 196562306a36Sopenharmony_ci 196662306a36Sopenharmony_ci msk_owned_by_me(msk); 196762306a36Sopenharmony_ci 196862306a36Sopenharmony_ci if (copied <= 0) 196962306a36Sopenharmony_ci return; 197062306a36Sopenharmony_ci 197162306a36Sopenharmony_ci if (!msk->rcvspace_init) 197262306a36Sopenharmony_ci mptcp_rcv_space_init(msk, msk->first); 197362306a36Sopenharmony_ci 197462306a36Sopenharmony_ci msk->rcvq_space.copied += copied; 197562306a36Sopenharmony_ci 197662306a36Sopenharmony_ci mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); 197762306a36Sopenharmony_ci time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); 197862306a36Sopenharmony_ci 197962306a36Sopenharmony_ci rtt_us = msk->rcvq_space.rtt_us; 198062306a36Sopenharmony_ci if (rtt_us && time < (rtt_us >> 3)) 198162306a36Sopenharmony_ci return; 198262306a36Sopenharmony_ci 198362306a36Sopenharmony_ci rtt_us = 0; 198462306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 198562306a36Sopenharmony_ci const struct tcp_sock *tp; 198662306a36Sopenharmony_ci u64 sf_rtt_us; 198762306a36Sopenharmony_ci u32 sf_advmss; 198862306a36Sopenharmony_ci 198962306a36Sopenharmony_ci tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); 199062306a36Sopenharmony_ci 199162306a36Sopenharmony_ci sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); 199262306a36Sopenharmony_ci sf_advmss = READ_ONCE(tp->advmss); 199362306a36Sopenharmony_ci 199462306a36Sopenharmony_ci rtt_us = max(sf_rtt_us, 
rtt_us); 199562306a36Sopenharmony_ci advmss = max(sf_advmss, advmss); 199662306a36Sopenharmony_ci scaling_ratio = min(tp->scaling_ratio, scaling_ratio); 199762306a36Sopenharmony_ci } 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci msk->rcvq_space.rtt_us = rtt_us; 200062306a36Sopenharmony_ci msk->scaling_ratio = scaling_ratio; 200162306a36Sopenharmony_ci if (time < (rtt_us >> 3) || rtt_us == 0) 200262306a36Sopenharmony_ci return; 200362306a36Sopenharmony_ci 200462306a36Sopenharmony_ci if (msk->rcvq_space.copied <= msk->rcvq_space.space) 200562306a36Sopenharmony_ci goto new_measure; 200662306a36Sopenharmony_ci 200762306a36Sopenharmony_ci if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && 200862306a36Sopenharmony_ci !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 200962306a36Sopenharmony_ci u64 rcvwin, grow; 201062306a36Sopenharmony_ci int rcvbuf; 201162306a36Sopenharmony_ci 201262306a36Sopenharmony_ci rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; 201362306a36Sopenharmony_ci 201462306a36Sopenharmony_ci grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); 201562306a36Sopenharmony_ci 201662306a36Sopenharmony_ci do_div(grow, msk->rcvq_space.space); 201762306a36Sopenharmony_ci rcvwin += (grow << 1); 201862306a36Sopenharmony_ci 201962306a36Sopenharmony_ci rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), 202062306a36Sopenharmony_ci READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); 202162306a36Sopenharmony_ci 202262306a36Sopenharmony_ci if (rcvbuf > sk->sk_rcvbuf) { 202362306a36Sopenharmony_ci u32 window_clamp; 202462306a36Sopenharmony_ci 202562306a36Sopenharmony_ci window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); 202662306a36Sopenharmony_ci WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 202762306a36Sopenharmony_ci 202862306a36Sopenharmony_ci /* Make subflows follow along. 
If we do not do this, we 202962306a36Sopenharmony_ci * get drops at subflow level if skbs can't be moved to 203062306a36Sopenharmony_ci * the mptcp rx queue fast enough (announced rcv_win can 203162306a36Sopenharmony_ci * exceed ssk->sk_rcvbuf). 203262306a36Sopenharmony_ci */ 203362306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 203462306a36Sopenharmony_ci struct sock *ssk; 203562306a36Sopenharmony_ci bool slow; 203662306a36Sopenharmony_ci 203762306a36Sopenharmony_ci ssk = mptcp_subflow_tcp_sock(subflow); 203862306a36Sopenharmony_ci slow = lock_sock_fast(ssk); 203962306a36Sopenharmony_ci WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); 204062306a36Sopenharmony_ci tcp_sk(ssk)->window_clamp = window_clamp; 204162306a36Sopenharmony_ci tcp_cleanup_rbuf(ssk, 1); 204262306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 204362306a36Sopenharmony_ci } 204462306a36Sopenharmony_ci } 204562306a36Sopenharmony_ci } 204662306a36Sopenharmony_ci 204762306a36Sopenharmony_ci msk->rcvq_space.space = msk->rcvq_space.copied; 204862306a36Sopenharmony_cinew_measure: 204962306a36Sopenharmony_ci msk->rcvq_space.copied = 0; 205062306a36Sopenharmony_ci msk->rcvq_space.time = mstamp; 205162306a36Sopenharmony_ci} 205262306a36Sopenharmony_ci 205362306a36Sopenharmony_cistatic void __mptcp_update_rmem(struct sock *sk) 205462306a36Sopenharmony_ci{ 205562306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 205662306a36Sopenharmony_ci 205762306a36Sopenharmony_ci if (!msk->rmem_released) 205862306a36Sopenharmony_ci return; 205962306a36Sopenharmony_ci 206062306a36Sopenharmony_ci atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); 206162306a36Sopenharmony_ci mptcp_rmem_uncharge(sk, msk->rmem_released); 206262306a36Sopenharmony_ci WRITE_ONCE(msk->rmem_released, 0); 206362306a36Sopenharmony_ci} 206462306a36Sopenharmony_ci 206562306a36Sopenharmony_cistatic void __mptcp_splice_receive_queue(struct sock *sk) 206662306a36Sopenharmony_ci{ 206762306a36Sopenharmony_ci struct mptcp_sock *msk = 
mptcp_sk(sk); 206862306a36Sopenharmony_ci 206962306a36Sopenharmony_ci skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); 207062306a36Sopenharmony_ci} 207162306a36Sopenharmony_ci 207262306a36Sopenharmony_cistatic bool __mptcp_move_skbs(struct mptcp_sock *msk) 207362306a36Sopenharmony_ci{ 207462306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 207562306a36Sopenharmony_ci unsigned int moved = 0; 207662306a36Sopenharmony_ci bool ret, done; 207762306a36Sopenharmony_ci 207862306a36Sopenharmony_ci do { 207962306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_recv_lookup(msk); 208062306a36Sopenharmony_ci bool slowpath; 208162306a36Sopenharmony_ci 208262306a36Sopenharmony_ci /* we can have data pending in the subflows only if the msk 208362306a36Sopenharmony_ci * receive buffer was full at subflow_data_ready() time, 208462306a36Sopenharmony_ci * that is an unlikely slow path. 208562306a36Sopenharmony_ci */ 208662306a36Sopenharmony_ci if (likely(!ssk)) 208762306a36Sopenharmony_ci break; 208862306a36Sopenharmony_ci 208962306a36Sopenharmony_ci slowpath = lock_sock_fast(ssk); 209062306a36Sopenharmony_ci mptcp_data_lock(sk); 209162306a36Sopenharmony_ci __mptcp_update_rmem(sk); 209262306a36Sopenharmony_ci done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 209362306a36Sopenharmony_ci mptcp_data_unlock(sk); 209462306a36Sopenharmony_ci 209562306a36Sopenharmony_ci if (unlikely(ssk->sk_err)) 209662306a36Sopenharmony_ci __mptcp_error_report(sk); 209762306a36Sopenharmony_ci unlock_sock_fast(ssk, slowpath); 209862306a36Sopenharmony_ci } while (!done); 209962306a36Sopenharmony_ci 210062306a36Sopenharmony_ci /* acquire the data lock only if some input data is pending */ 210162306a36Sopenharmony_ci ret = moved > 0; 210262306a36Sopenharmony_ci if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || 210362306a36Sopenharmony_ci !skb_queue_empty_lockless(&sk->sk_receive_queue)) { 210462306a36Sopenharmony_ci mptcp_data_lock(sk); 210562306a36Sopenharmony_ci 
__mptcp_update_rmem(sk); 210662306a36Sopenharmony_ci ret |= __mptcp_ofo_queue(msk); 210762306a36Sopenharmony_ci __mptcp_splice_receive_queue(sk); 210862306a36Sopenharmony_ci mptcp_data_unlock(sk); 210962306a36Sopenharmony_ci } 211062306a36Sopenharmony_ci if (ret) 211162306a36Sopenharmony_ci mptcp_check_data_fin((struct sock *)msk); 211262306a36Sopenharmony_ci return !skb_queue_empty(&msk->receive_queue); 211362306a36Sopenharmony_ci} 211462306a36Sopenharmony_ci 211562306a36Sopenharmony_cistatic unsigned int mptcp_inq_hint(const struct sock *sk) 211662306a36Sopenharmony_ci{ 211762306a36Sopenharmony_ci const struct mptcp_sock *msk = mptcp_sk(sk); 211862306a36Sopenharmony_ci const struct sk_buff *skb; 211962306a36Sopenharmony_ci 212062306a36Sopenharmony_ci skb = skb_peek(&msk->receive_queue); 212162306a36Sopenharmony_ci if (skb) { 212262306a36Sopenharmony_ci u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; 212362306a36Sopenharmony_ci 212462306a36Sopenharmony_ci if (hint_val >= INT_MAX) 212562306a36Sopenharmony_ci return INT_MAX; 212662306a36Sopenharmony_ci 212762306a36Sopenharmony_ci return (unsigned int)hint_val; 212862306a36Sopenharmony_ci } 212962306a36Sopenharmony_ci 213062306a36Sopenharmony_ci if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) 213162306a36Sopenharmony_ci return 1; 213262306a36Sopenharmony_ci 213362306a36Sopenharmony_ci return 0; 213462306a36Sopenharmony_ci} 213562306a36Sopenharmony_ci 213662306a36Sopenharmony_cistatic int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 213762306a36Sopenharmony_ci int flags, int *addr_len) 213862306a36Sopenharmony_ci{ 213962306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 214062306a36Sopenharmony_ci struct scm_timestamping_internal tss; 214162306a36Sopenharmony_ci int copied = 0, cmsg_flags = 0; 214262306a36Sopenharmony_ci int target; 214362306a36Sopenharmony_ci long timeo; 214462306a36Sopenharmony_ci 214562306a36Sopenharmony_ci /* MSG_ERRQUEUE is really a no-op 
till we support IP_RECVERR */ 214662306a36Sopenharmony_ci if (unlikely(flags & MSG_ERRQUEUE)) 214762306a36Sopenharmony_ci return inet_recv_error(sk, msg, len, addr_len); 214862306a36Sopenharmony_ci 214962306a36Sopenharmony_ci lock_sock(sk); 215062306a36Sopenharmony_ci if (unlikely(sk->sk_state == TCP_LISTEN)) { 215162306a36Sopenharmony_ci copied = -ENOTCONN; 215262306a36Sopenharmony_ci goto out_err; 215362306a36Sopenharmony_ci } 215462306a36Sopenharmony_ci 215562306a36Sopenharmony_ci timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 215662306a36Sopenharmony_ci 215762306a36Sopenharmony_ci len = min_t(size_t, len, INT_MAX); 215862306a36Sopenharmony_ci target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 215962306a36Sopenharmony_ci 216062306a36Sopenharmony_ci if (unlikely(msk->recvmsg_inq)) 216162306a36Sopenharmony_ci cmsg_flags = MPTCP_CMSG_INQ; 216262306a36Sopenharmony_ci 216362306a36Sopenharmony_ci while (copied < len) { 216462306a36Sopenharmony_ci int bytes_read; 216562306a36Sopenharmony_ci 216662306a36Sopenharmony_ci bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); 216762306a36Sopenharmony_ci if (unlikely(bytes_read < 0)) { 216862306a36Sopenharmony_ci if (!copied) 216962306a36Sopenharmony_ci copied = bytes_read; 217062306a36Sopenharmony_ci goto out_err; 217162306a36Sopenharmony_ci } 217262306a36Sopenharmony_ci 217362306a36Sopenharmony_ci copied += bytes_read; 217462306a36Sopenharmony_ci 217562306a36Sopenharmony_ci /* be sure to advertise window change */ 217662306a36Sopenharmony_ci mptcp_cleanup_rbuf(msk); 217762306a36Sopenharmony_ci 217862306a36Sopenharmony_ci if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) 217962306a36Sopenharmony_ci continue; 218062306a36Sopenharmony_ci 218162306a36Sopenharmony_ci /* only the master socket status is relevant here. 
The exit 218262306a36Sopenharmony_ci * conditions mirror closely tcp_recvmsg() 218362306a36Sopenharmony_ci */ 218462306a36Sopenharmony_ci if (copied >= target) 218562306a36Sopenharmony_ci break; 218662306a36Sopenharmony_ci 218762306a36Sopenharmony_ci if (copied) { 218862306a36Sopenharmony_ci if (sk->sk_err || 218962306a36Sopenharmony_ci sk->sk_state == TCP_CLOSE || 219062306a36Sopenharmony_ci (sk->sk_shutdown & RCV_SHUTDOWN) || 219162306a36Sopenharmony_ci !timeo || 219262306a36Sopenharmony_ci signal_pending(current)) 219362306a36Sopenharmony_ci break; 219462306a36Sopenharmony_ci } else { 219562306a36Sopenharmony_ci if (sk->sk_err) { 219662306a36Sopenharmony_ci copied = sock_error(sk); 219762306a36Sopenharmony_ci break; 219862306a36Sopenharmony_ci } 219962306a36Sopenharmony_ci 220062306a36Sopenharmony_ci if (sk->sk_shutdown & RCV_SHUTDOWN) { 220162306a36Sopenharmony_ci /* race breaker: the shutdown could be after the 220262306a36Sopenharmony_ci * previous receive queue check 220362306a36Sopenharmony_ci */ 220462306a36Sopenharmony_ci if (__mptcp_move_skbs(msk)) 220562306a36Sopenharmony_ci continue; 220662306a36Sopenharmony_ci break; 220762306a36Sopenharmony_ci } 220862306a36Sopenharmony_ci 220962306a36Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) { 221062306a36Sopenharmony_ci copied = -ENOTCONN; 221162306a36Sopenharmony_ci break; 221262306a36Sopenharmony_ci } 221362306a36Sopenharmony_ci 221462306a36Sopenharmony_ci if (!timeo) { 221562306a36Sopenharmony_ci copied = -EAGAIN; 221662306a36Sopenharmony_ci break; 221762306a36Sopenharmony_ci } 221862306a36Sopenharmony_ci 221962306a36Sopenharmony_ci if (signal_pending(current)) { 222062306a36Sopenharmony_ci copied = sock_intr_errno(timeo); 222162306a36Sopenharmony_ci break; 222262306a36Sopenharmony_ci } 222362306a36Sopenharmony_ci } 222462306a36Sopenharmony_ci 222562306a36Sopenharmony_ci pr_debug("block timeout %ld", timeo); 222662306a36Sopenharmony_ci sk_wait_data(sk, &timeo, NULL); 222762306a36Sopenharmony_ci } 
222862306a36Sopenharmony_ci 222962306a36Sopenharmony_ciout_err: 223062306a36Sopenharmony_ci if (cmsg_flags && copied >= 0) { 223162306a36Sopenharmony_ci if (cmsg_flags & MPTCP_CMSG_TS) 223262306a36Sopenharmony_ci tcp_recv_timestamp(msg, sk, &tss); 223362306a36Sopenharmony_ci 223462306a36Sopenharmony_ci if (cmsg_flags & MPTCP_CMSG_INQ) { 223562306a36Sopenharmony_ci unsigned int inq = mptcp_inq_hint(sk); 223662306a36Sopenharmony_ci 223762306a36Sopenharmony_ci put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); 223862306a36Sopenharmony_ci } 223962306a36Sopenharmony_ci } 224062306a36Sopenharmony_ci 224162306a36Sopenharmony_ci pr_debug("msk=%p rx queue empty=%d:%d copied=%d", 224262306a36Sopenharmony_ci msk, skb_queue_empty_lockless(&sk->sk_receive_queue), 224362306a36Sopenharmony_ci skb_queue_empty(&msk->receive_queue), copied); 224462306a36Sopenharmony_ci if (!(flags & MSG_PEEK)) 224562306a36Sopenharmony_ci mptcp_rcv_space_adjust(msk, copied); 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_ci release_sock(sk); 224862306a36Sopenharmony_ci return copied; 224962306a36Sopenharmony_ci} 225062306a36Sopenharmony_ci 225162306a36Sopenharmony_cistatic void mptcp_retransmit_timer(struct timer_list *t) 225262306a36Sopenharmony_ci{ 225362306a36Sopenharmony_ci struct inet_connection_sock *icsk = from_timer(icsk, t, 225462306a36Sopenharmony_ci icsk_retransmit_timer); 225562306a36Sopenharmony_ci struct sock *sk = &icsk->icsk_inet.sk; 225662306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 225762306a36Sopenharmony_ci 225862306a36Sopenharmony_ci bh_lock_sock(sk); 225962306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) { 226062306a36Sopenharmony_ci /* we need a process context to retransmit */ 226162306a36Sopenharmony_ci if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) 226262306a36Sopenharmony_ci mptcp_schedule_work(sk); 226362306a36Sopenharmony_ci } else { 226462306a36Sopenharmony_ci /* delegate our work to tcp_release_cb() */ 226562306a36Sopenharmony_ci 
__set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); 226662306a36Sopenharmony_ci } 226762306a36Sopenharmony_ci bh_unlock_sock(sk); 226862306a36Sopenharmony_ci sock_put(sk); 226962306a36Sopenharmony_ci} 227062306a36Sopenharmony_ci 227162306a36Sopenharmony_cistatic void mptcp_tout_timer(struct timer_list *t) 227262306a36Sopenharmony_ci{ 227362306a36Sopenharmony_ci struct sock *sk = from_timer(sk, t, sk_timer); 227462306a36Sopenharmony_ci 227562306a36Sopenharmony_ci mptcp_schedule_work(sk); 227662306a36Sopenharmony_ci sock_put(sk); 227762306a36Sopenharmony_ci} 227862306a36Sopenharmony_ci 227962306a36Sopenharmony_ci/* Find an idle subflow. Return NULL if there is unacked data at tcp 228062306a36Sopenharmony_ci * level. 228162306a36Sopenharmony_ci * 228262306a36Sopenharmony_ci * A backup subflow is returned only if that is the only kind available. 228362306a36Sopenharmony_ci */ 228462306a36Sopenharmony_cistruct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) 228562306a36Sopenharmony_ci{ 228662306a36Sopenharmony_ci struct sock *backup = NULL, *pick = NULL; 228762306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 228862306a36Sopenharmony_ci int min_stale_count = INT_MAX; 228962306a36Sopenharmony_ci 229062306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 229162306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 229262306a36Sopenharmony_ci 229362306a36Sopenharmony_ci if (!__mptcp_subflow_active(subflow)) 229462306a36Sopenharmony_ci continue; 229562306a36Sopenharmony_ci 229662306a36Sopenharmony_ci /* still data outstanding at TCP level? 
skip this */ 229762306a36Sopenharmony_ci if (!tcp_rtx_and_write_queues_empty(ssk)) { 229862306a36Sopenharmony_ci mptcp_pm_subflow_chk_stale(msk, ssk); 229962306a36Sopenharmony_ci min_stale_count = min_t(int, min_stale_count, subflow->stale_count); 230062306a36Sopenharmony_ci continue; 230162306a36Sopenharmony_ci } 230262306a36Sopenharmony_ci 230362306a36Sopenharmony_ci if (subflow->backup) { 230462306a36Sopenharmony_ci if (!backup) 230562306a36Sopenharmony_ci backup = ssk; 230662306a36Sopenharmony_ci continue; 230762306a36Sopenharmony_ci } 230862306a36Sopenharmony_ci 230962306a36Sopenharmony_ci if (!pick) 231062306a36Sopenharmony_ci pick = ssk; 231162306a36Sopenharmony_ci } 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_ci if (pick) 231462306a36Sopenharmony_ci return pick; 231562306a36Sopenharmony_ci 231662306a36Sopenharmony_ci /* use backup only if there are no progresses anywhere */ 231762306a36Sopenharmony_ci return min_stale_count > 1 ? backup : NULL; 231862306a36Sopenharmony_ci} 231962306a36Sopenharmony_ci 232062306a36Sopenharmony_cibool __mptcp_retransmit_pending_data(struct sock *sk) 232162306a36Sopenharmony_ci{ 232262306a36Sopenharmony_ci struct mptcp_data_frag *cur, *rtx_head; 232362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ci if (__mptcp_check_fallback(msk)) 232662306a36Sopenharmony_ci return false; 232762306a36Sopenharmony_ci 232862306a36Sopenharmony_ci /* the closing socket has some data untransmitted and/or unacked: 232962306a36Sopenharmony_ci * some data in the mptcp rtx queue has not really xmitted yet. 
233062306a36Sopenharmony_ci * keep it simple and re-inject the whole mptcp level rtx queue 233162306a36Sopenharmony_ci */ 233262306a36Sopenharmony_ci mptcp_data_lock(sk); 233362306a36Sopenharmony_ci __mptcp_clean_una_wakeup(sk); 233462306a36Sopenharmony_ci rtx_head = mptcp_rtx_head(sk); 233562306a36Sopenharmony_ci if (!rtx_head) { 233662306a36Sopenharmony_ci mptcp_data_unlock(sk); 233762306a36Sopenharmony_ci return false; 233862306a36Sopenharmony_ci } 233962306a36Sopenharmony_ci 234062306a36Sopenharmony_ci msk->recovery_snd_nxt = msk->snd_nxt; 234162306a36Sopenharmony_ci msk->recovery = true; 234262306a36Sopenharmony_ci mptcp_data_unlock(sk); 234362306a36Sopenharmony_ci 234462306a36Sopenharmony_ci msk->first_pending = rtx_head; 234562306a36Sopenharmony_ci msk->snd_burst = 0; 234662306a36Sopenharmony_ci 234762306a36Sopenharmony_ci /* be sure to clear the "sent status" on all re-injected fragments */ 234862306a36Sopenharmony_ci list_for_each_entry(cur, &msk->rtx_queue, list) { 234962306a36Sopenharmony_ci if (!cur->already_sent) 235062306a36Sopenharmony_ci break; 235162306a36Sopenharmony_ci cur->already_sent = 0; 235262306a36Sopenharmony_ci } 235362306a36Sopenharmony_ci 235462306a36Sopenharmony_ci return true; 235562306a36Sopenharmony_ci} 235662306a36Sopenharmony_ci 235762306a36Sopenharmony_ci/* flags for __mptcp_close_ssk() */ 235862306a36Sopenharmony_ci#define MPTCP_CF_PUSH BIT(1) 235962306a36Sopenharmony_ci#define MPTCP_CF_FASTCLOSE BIT(2) 236062306a36Sopenharmony_ci 236162306a36Sopenharmony_ci/* be sure to send a reset only if the caller asked for it, also 236262306a36Sopenharmony_ci * clean completely the subflow status when the subflow reaches 236362306a36Sopenharmony_ci * TCP_CLOSE state 236462306a36Sopenharmony_ci */ 236562306a36Sopenharmony_cistatic void __mptcp_subflow_disconnect(struct sock *ssk, 236662306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, 236762306a36Sopenharmony_ci unsigned int flags) 236862306a36Sopenharmony_ci{ 
236962306a36Sopenharmony_ci if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || 237062306a36Sopenharmony_ci (flags & MPTCP_CF_FASTCLOSE)) { 237162306a36Sopenharmony_ci /* The MPTCP code never wait on the subflow sockets, TCP-level 237262306a36Sopenharmony_ci * disconnect should never fail 237362306a36Sopenharmony_ci */ 237462306a36Sopenharmony_ci WARN_ON_ONCE(tcp_disconnect(ssk, 0)); 237562306a36Sopenharmony_ci mptcp_subflow_ctx_reset(subflow); 237662306a36Sopenharmony_ci } else { 237762306a36Sopenharmony_ci tcp_shutdown(ssk, SEND_SHUTDOWN); 237862306a36Sopenharmony_ci } 237962306a36Sopenharmony_ci} 238062306a36Sopenharmony_ci 238162306a36Sopenharmony_ci/* subflow sockets can be either outgoing (connect) or incoming 238262306a36Sopenharmony_ci * (accept). 238362306a36Sopenharmony_ci * 238462306a36Sopenharmony_ci * Outgoing subflows use in-kernel sockets. 238562306a36Sopenharmony_ci * Incoming subflows do not have their own 'struct socket' allocated, 238662306a36Sopenharmony_ci * so we need to use tcp_close() after detaching them from the mptcp 238762306a36Sopenharmony_ci * parent socket. 238862306a36Sopenharmony_ci */ 238962306a36Sopenharmony_cistatic void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, 239062306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, 239162306a36Sopenharmony_ci unsigned int flags) 239262306a36Sopenharmony_ci{ 239362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 239462306a36Sopenharmony_ci bool dispose_it, need_push = false; 239562306a36Sopenharmony_ci 239662306a36Sopenharmony_ci /* If the first subflow moved to a close state before accept, e.g. due 239762306a36Sopenharmony_ci * to an incoming reset or listener shutdown, the subflow socket is 239862306a36Sopenharmony_ci * already deleted by inet_child_forget() and the mptcp socket can't 239962306a36Sopenharmony_ci * survive too. 
240062306a36Sopenharmony_ci */ 240162306a36Sopenharmony_ci if (msk->in_accept_queue && msk->first == ssk && 240262306a36Sopenharmony_ci (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { 240362306a36Sopenharmony_ci /* ensure later check in mptcp_worker() will dispose the msk */ 240462306a36Sopenharmony_ci mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1)); 240562306a36Sopenharmony_ci sock_set_flag(sk, SOCK_DEAD); 240662306a36Sopenharmony_ci lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 240762306a36Sopenharmony_ci mptcp_subflow_drop_ctx(ssk); 240862306a36Sopenharmony_ci goto out_release; 240962306a36Sopenharmony_ci } 241062306a36Sopenharmony_ci 241162306a36Sopenharmony_ci dispose_it = msk->free_first || ssk != msk->first; 241262306a36Sopenharmony_ci if (dispose_it) 241362306a36Sopenharmony_ci list_del(&subflow->node); 241462306a36Sopenharmony_ci 241562306a36Sopenharmony_ci lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 241662306a36Sopenharmony_ci 241762306a36Sopenharmony_ci if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { 241862306a36Sopenharmony_ci /* be sure to force the tcp_close path 241962306a36Sopenharmony_ci * to generate the egress reset 242062306a36Sopenharmony_ci */ 242162306a36Sopenharmony_ci ssk->sk_lingertime = 0; 242262306a36Sopenharmony_ci sock_set_flag(ssk, SOCK_LINGER); 242362306a36Sopenharmony_ci subflow->send_fastclose = 1; 242462306a36Sopenharmony_ci } 242562306a36Sopenharmony_ci 242662306a36Sopenharmony_ci need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); 242762306a36Sopenharmony_ci if (!dispose_it) { 242862306a36Sopenharmony_ci __mptcp_subflow_disconnect(ssk, subflow, flags); 242962306a36Sopenharmony_ci release_sock(ssk); 243062306a36Sopenharmony_ci 243162306a36Sopenharmony_ci goto out; 243262306a36Sopenharmony_ci } 243362306a36Sopenharmony_ci 243462306a36Sopenharmony_ci subflow->disposable = 1; 243562306a36Sopenharmony_ci 243662306a36Sopenharmony_ci /* if ssk hit tcp_done(), 
tcp_cleanup_ulp() cleared the related ops 243762306a36Sopenharmony_ci * the ssk has been already destroyed, we just need to release the 243862306a36Sopenharmony_ci * reference owned by msk; 243962306a36Sopenharmony_ci */ 244062306a36Sopenharmony_ci if (!inet_csk(ssk)->icsk_ulp_ops) { 244162306a36Sopenharmony_ci WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); 244262306a36Sopenharmony_ci kfree_rcu(subflow, rcu); 244362306a36Sopenharmony_ci } else { 244462306a36Sopenharmony_ci /* otherwise tcp will dispose of the ssk and subflow ctx */ 244562306a36Sopenharmony_ci __tcp_close(ssk, 0); 244662306a36Sopenharmony_ci 244762306a36Sopenharmony_ci /* close acquired an extra ref */ 244862306a36Sopenharmony_ci __sock_put(ssk); 244962306a36Sopenharmony_ci } 245062306a36Sopenharmony_ci 245162306a36Sopenharmony_ciout_release: 245262306a36Sopenharmony_ci __mptcp_subflow_error_report(sk, ssk); 245362306a36Sopenharmony_ci release_sock(ssk); 245462306a36Sopenharmony_ci 245562306a36Sopenharmony_ci sock_put(ssk); 245662306a36Sopenharmony_ci 245762306a36Sopenharmony_ci if (ssk == msk->first) 245862306a36Sopenharmony_ci WRITE_ONCE(msk->first, NULL); 245962306a36Sopenharmony_ci 246062306a36Sopenharmony_ciout: 246162306a36Sopenharmony_ci __mptcp_sync_sndbuf(sk); 246262306a36Sopenharmony_ci if (need_push) 246362306a36Sopenharmony_ci __mptcp_push_pending(sk, 0); 246462306a36Sopenharmony_ci 246562306a36Sopenharmony_ci /* Catch every 'all subflows closed' scenario, including peers silently 246662306a36Sopenharmony_ci * closing them, e.g. due to timeout. 246762306a36Sopenharmony_ci * For established sockets, allow an additional timeout before closing, 246862306a36Sopenharmony_ci * as the protocol can still create more subflows. 
246962306a36Sopenharmony_ci */ 247062306a36Sopenharmony_ci if (list_is_singular(&msk->conn_list) && msk->first && 247162306a36Sopenharmony_ci inet_sk_state_load(msk->first) == TCP_CLOSE) { 247262306a36Sopenharmony_ci if (sk->sk_state != TCP_ESTABLISHED || 247362306a36Sopenharmony_ci msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { 247462306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 247562306a36Sopenharmony_ci mptcp_close_wake_up(sk); 247662306a36Sopenharmony_ci } else { 247762306a36Sopenharmony_ci mptcp_start_tout_timer(sk); 247862306a36Sopenharmony_ci } 247962306a36Sopenharmony_ci } 248062306a36Sopenharmony_ci} 248162306a36Sopenharmony_ci 248262306a36Sopenharmony_civoid mptcp_close_ssk(struct sock *sk, struct sock *ssk, 248362306a36Sopenharmony_ci struct mptcp_subflow_context *subflow) 248462306a36Sopenharmony_ci{ 248562306a36Sopenharmony_ci if (sk->sk_state == TCP_ESTABLISHED) 248662306a36Sopenharmony_ci mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); 248762306a36Sopenharmony_ci 248862306a36Sopenharmony_ci /* subflow aborted before reaching the fully_established status 248962306a36Sopenharmony_ci * attempt the creation of the next subflow 249062306a36Sopenharmony_ci */ 249162306a36Sopenharmony_ci mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow); 249262306a36Sopenharmony_ci 249362306a36Sopenharmony_ci __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); 249462306a36Sopenharmony_ci} 249562306a36Sopenharmony_ci 249662306a36Sopenharmony_cistatic unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) 249762306a36Sopenharmony_ci{ 249862306a36Sopenharmony_ci return 0; 249962306a36Sopenharmony_ci} 250062306a36Sopenharmony_ci 250162306a36Sopenharmony_cistatic void __mptcp_close_subflow(struct sock *sk) 250262306a36Sopenharmony_ci{ 250362306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, *tmp; 250462306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 250562306a36Sopenharmony_ci 250662306a36Sopenharmony_ci 
might_sleep(); 250762306a36Sopenharmony_ci 250862306a36Sopenharmony_ci mptcp_for_each_subflow_safe(msk, subflow, tmp) { 250962306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 251062306a36Sopenharmony_ci 251162306a36Sopenharmony_ci if (inet_sk_state_load(ssk) != TCP_CLOSE) 251262306a36Sopenharmony_ci continue; 251362306a36Sopenharmony_ci 251462306a36Sopenharmony_ci /* 'subflow_data_ready' will re-sched once rx queue is empty */ 251562306a36Sopenharmony_ci if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) 251662306a36Sopenharmony_ci continue; 251762306a36Sopenharmony_ci 251862306a36Sopenharmony_ci mptcp_close_ssk(sk, ssk, subflow); 251962306a36Sopenharmony_ci } 252062306a36Sopenharmony_ci 252162306a36Sopenharmony_ci} 252262306a36Sopenharmony_ci 252362306a36Sopenharmony_cistatic bool mptcp_close_tout_expired(const struct sock *sk) 252462306a36Sopenharmony_ci{ 252562306a36Sopenharmony_ci if (!inet_csk(sk)->icsk_mtup.probe_timestamp || 252662306a36Sopenharmony_ci sk->sk_state == TCP_CLOSE) 252762306a36Sopenharmony_ci return false; 252862306a36Sopenharmony_ci 252962306a36Sopenharmony_ci return time_after32(tcp_jiffies32, 253062306a36Sopenharmony_ci inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN); 253162306a36Sopenharmony_ci} 253262306a36Sopenharmony_ci 253362306a36Sopenharmony_cistatic void mptcp_check_fastclose(struct mptcp_sock *msk) 253462306a36Sopenharmony_ci{ 253562306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, *tmp; 253662306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 253762306a36Sopenharmony_ci 253862306a36Sopenharmony_ci if (likely(!READ_ONCE(msk->rcv_fastclose))) 253962306a36Sopenharmony_ci return; 254062306a36Sopenharmony_ci 254162306a36Sopenharmony_ci mptcp_token_destroy(msk); 254262306a36Sopenharmony_ci 254362306a36Sopenharmony_ci mptcp_for_each_subflow_safe(msk, subflow, tmp) { 254462306a36Sopenharmony_ci struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 254562306a36Sopenharmony_ci 
bool slow; 254662306a36Sopenharmony_ci 254762306a36Sopenharmony_ci slow = lock_sock_fast(tcp_sk); 254862306a36Sopenharmony_ci if (tcp_sk->sk_state != TCP_CLOSE) { 254962306a36Sopenharmony_ci tcp_send_active_reset(tcp_sk, GFP_ATOMIC); 255062306a36Sopenharmony_ci tcp_set_state(tcp_sk, TCP_CLOSE); 255162306a36Sopenharmony_ci } 255262306a36Sopenharmony_ci unlock_sock_fast(tcp_sk, slow); 255362306a36Sopenharmony_ci } 255462306a36Sopenharmony_ci 255562306a36Sopenharmony_ci /* Mirror the tcp_reset() error propagation */ 255662306a36Sopenharmony_ci switch (sk->sk_state) { 255762306a36Sopenharmony_ci case TCP_SYN_SENT: 255862306a36Sopenharmony_ci WRITE_ONCE(sk->sk_err, ECONNREFUSED); 255962306a36Sopenharmony_ci break; 256062306a36Sopenharmony_ci case TCP_CLOSE_WAIT: 256162306a36Sopenharmony_ci WRITE_ONCE(sk->sk_err, EPIPE); 256262306a36Sopenharmony_ci break; 256362306a36Sopenharmony_ci case TCP_CLOSE: 256462306a36Sopenharmony_ci return; 256562306a36Sopenharmony_ci default: 256662306a36Sopenharmony_ci WRITE_ONCE(sk->sk_err, ECONNRESET); 256762306a36Sopenharmony_ci } 256862306a36Sopenharmony_ci 256962306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 257062306a36Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 257162306a36Sopenharmony_ci smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 257262306a36Sopenharmony_ci set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); 257362306a36Sopenharmony_ci 257462306a36Sopenharmony_ci /* the calling mptcp_worker will properly destroy the socket */ 257562306a36Sopenharmony_ci if (sock_flag(sk, SOCK_DEAD)) 257662306a36Sopenharmony_ci return; 257762306a36Sopenharmony_ci 257862306a36Sopenharmony_ci sk->sk_state_change(sk); 257962306a36Sopenharmony_ci sk_error_report(sk); 258062306a36Sopenharmony_ci} 258162306a36Sopenharmony_ci 258262306a36Sopenharmony_cistatic void __mptcp_retrans(struct sock *sk) 258362306a36Sopenharmony_ci{ 258462306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 258562306a36Sopenharmony_ci 
struct mptcp_subflow_context *subflow; 258662306a36Sopenharmony_ci struct mptcp_sendmsg_info info = {}; 258762306a36Sopenharmony_ci struct mptcp_data_frag *dfrag; 258862306a36Sopenharmony_ci struct sock *ssk; 258962306a36Sopenharmony_ci int ret, err; 259062306a36Sopenharmony_ci u16 len = 0; 259162306a36Sopenharmony_ci 259262306a36Sopenharmony_ci mptcp_clean_una_wakeup(sk); 259362306a36Sopenharmony_ci 259462306a36Sopenharmony_ci /* first check ssk: need to kick "stale" logic */ 259562306a36Sopenharmony_ci err = mptcp_sched_get_retrans(msk); 259662306a36Sopenharmony_ci dfrag = mptcp_rtx_head(sk); 259762306a36Sopenharmony_ci if (!dfrag) { 259862306a36Sopenharmony_ci if (mptcp_data_fin_enabled(msk)) { 259962306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 260062306a36Sopenharmony_ci 260162306a36Sopenharmony_ci icsk->icsk_retransmits++; 260262306a36Sopenharmony_ci mptcp_set_datafin_timeout(sk); 260362306a36Sopenharmony_ci mptcp_send_ack(msk); 260462306a36Sopenharmony_ci 260562306a36Sopenharmony_ci goto reset_timer; 260662306a36Sopenharmony_ci } 260762306a36Sopenharmony_ci 260862306a36Sopenharmony_ci if (!mptcp_send_head(sk)) 260962306a36Sopenharmony_ci return; 261062306a36Sopenharmony_ci 261162306a36Sopenharmony_ci goto reset_timer; 261262306a36Sopenharmony_ci } 261362306a36Sopenharmony_ci 261462306a36Sopenharmony_ci if (err) 261562306a36Sopenharmony_ci goto reset_timer; 261662306a36Sopenharmony_ci 261762306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 261862306a36Sopenharmony_ci if (READ_ONCE(subflow->scheduled)) { 261962306a36Sopenharmony_ci u16 copied = 0; 262062306a36Sopenharmony_ci 262162306a36Sopenharmony_ci mptcp_subflow_set_scheduled(subflow, false); 262262306a36Sopenharmony_ci 262362306a36Sopenharmony_ci ssk = mptcp_subflow_tcp_sock(subflow); 262462306a36Sopenharmony_ci 262562306a36Sopenharmony_ci lock_sock(ssk); 262662306a36Sopenharmony_ci 262762306a36Sopenharmony_ci /* limit retransmission to the bytes already sent on some 
subflows */ 262862306a36Sopenharmony_ci info.sent = 0; 262962306a36Sopenharmony_ci info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : 263062306a36Sopenharmony_ci dfrag->already_sent; 263162306a36Sopenharmony_ci while (info.sent < info.limit) { 263262306a36Sopenharmony_ci ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 263362306a36Sopenharmony_ci if (ret <= 0) 263462306a36Sopenharmony_ci break; 263562306a36Sopenharmony_ci 263662306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 263762306a36Sopenharmony_ci copied += ret; 263862306a36Sopenharmony_ci info.sent += ret; 263962306a36Sopenharmony_ci } 264062306a36Sopenharmony_ci if (copied) { 264162306a36Sopenharmony_ci len = max(copied, len); 264262306a36Sopenharmony_ci tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 264362306a36Sopenharmony_ci info.size_goal); 264462306a36Sopenharmony_ci WRITE_ONCE(msk->allow_infinite_fallback, false); 264562306a36Sopenharmony_ci } 264662306a36Sopenharmony_ci 264762306a36Sopenharmony_ci release_sock(ssk); 264862306a36Sopenharmony_ci } 264962306a36Sopenharmony_ci } 265062306a36Sopenharmony_ci 265162306a36Sopenharmony_ci msk->bytes_retrans += len; 265262306a36Sopenharmony_ci dfrag->already_sent = max(dfrag->already_sent, len); 265362306a36Sopenharmony_ci 265462306a36Sopenharmony_cireset_timer: 265562306a36Sopenharmony_ci mptcp_check_and_set_pending(sk); 265662306a36Sopenharmony_ci 265762306a36Sopenharmony_ci if (!mptcp_rtx_timer_pending(sk)) 265862306a36Sopenharmony_ci mptcp_reset_rtx_timer(sk); 265962306a36Sopenharmony_ci} 266062306a36Sopenharmony_ci 266162306a36Sopenharmony_ci/* schedule the timeout timer for the relevant event: either close timeout 266262306a36Sopenharmony_ci * or mp_fail timeout. 
The close timeout takes precedence on the mp_fail one 266362306a36Sopenharmony_ci */ 266462306a36Sopenharmony_civoid mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) 266562306a36Sopenharmony_ci{ 266662306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 266762306a36Sopenharmony_ci unsigned long timeout, close_timeout; 266862306a36Sopenharmony_ci 266962306a36Sopenharmony_ci if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) 267062306a36Sopenharmony_ci return; 267162306a36Sopenharmony_ci 267262306a36Sopenharmony_ci close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + 267362306a36Sopenharmony_ci TCP_TIMEWAIT_LEN; 267462306a36Sopenharmony_ci 267562306a36Sopenharmony_ci /* the close timeout takes precedence on the fail one, and here at least one of 267662306a36Sopenharmony_ci * them is active 267762306a36Sopenharmony_ci */ 267862306a36Sopenharmony_ci timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; 267962306a36Sopenharmony_ci 268062306a36Sopenharmony_ci sk_reset_timer(sk, &sk->sk_timer, timeout); 268162306a36Sopenharmony_ci} 268262306a36Sopenharmony_ci 268362306a36Sopenharmony_cistatic void mptcp_mp_fail_no_response(struct mptcp_sock *msk) 268462306a36Sopenharmony_ci{ 268562306a36Sopenharmony_ci struct sock *ssk = msk->first; 268662306a36Sopenharmony_ci bool slow; 268762306a36Sopenharmony_ci 268862306a36Sopenharmony_ci if (!ssk) 268962306a36Sopenharmony_ci return; 269062306a36Sopenharmony_ci 269162306a36Sopenharmony_ci pr_debug("MP_FAIL doesn't respond, reset the subflow"); 269262306a36Sopenharmony_ci 269362306a36Sopenharmony_ci slow = lock_sock_fast(ssk); 269462306a36Sopenharmony_ci mptcp_subflow_reset(ssk); 269562306a36Sopenharmony_ci WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); 269662306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 269762306a36Sopenharmony_ci} 269862306a36Sopenharmony_ci 269962306a36Sopenharmony_cistatic void mptcp_do_fastclose(struct sock *sk) 
270062306a36Sopenharmony_ci{ 270162306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, *tmp; 270262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 270362306a36Sopenharmony_ci 270462306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 270562306a36Sopenharmony_ci mptcp_for_each_subflow_safe(msk, subflow, tmp) 270662306a36Sopenharmony_ci __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), 270762306a36Sopenharmony_ci subflow, MPTCP_CF_FASTCLOSE); 270862306a36Sopenharmony_ci} 270962306a36Sopenharmony_ci 271062306a36Sopenharmony_cistatic void mptcp_worker(struct work_struct *work) 271162306a36Sopenharmony_ci{ 271262306a36Sopenharmony_ci struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); 271362306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 271462306a36Sopenharmony_ci unsigned long fail_tout; 271562306a36Sopenharmony_ci int state; 271662306a36Sopenharmony_ci 271762306a36Sopenharmony_ci lock_sock(sk); 271862306a36Sopenharmony_ci state = sk->sk_state; 271962306a36Sopenharmony_ci if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) 272062306a36Sopenharmony_ci goto unlock; 272162306a36Sopenharmony_ci 272262306a36Sopenharmony_ci mptcp_check_fastclose(msk); 272362306a36Sopenharmony_ci 272462306a36Sopenharmony_ci mptcp_pm_nl_work(msk); 272562306a36Sopenharmony_ci 272662306a36Sopenharmony_ci mptcp_check_send_data_fin(sk); 272762306a36Sopenharmony_ci mptcp_check_data_fin_ack(sk); 272862306a36Sopenharmony_ci mptcp_check_data_fin(sk); 272962306a36Sopenharmony_ci 273062306a36Sopenharmony_ci if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 273162306a36Sopenharmony_ci __mptcp_close_subflow(sk); 273262306a36Sopenharmony_ci 273362306a36Sopenharmony_ci if (mptcp_close_tout_expired(sk)) { 273462306a36Sopenharmony_ci mptcp_do_fastclose(sk); 273562306a36Sopenharmony_ci mptcp_close_wake_up(sk); 273662306a36Sopenharmony_ci } 273762306a36Sopenharmony_ci 273862306a36Sopenharmony_ci if (sock_flag(sk, SOCK_DEAD) && 
sk->sk_state == TCP_CLOSE) { 273962306a36Sopenharmony_ci __mptcp_destroy_sock(sk); 274062306a36Sopenharmony_ci goto unlock; 274162306a36Sopenharmony_ci } 274262306a36Sopenharmony_ci 274362306a36Sopenharmony_ci if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) 274462306a36Sopenharmony_ci __mptcp_retrans(sk); 274562306a36Sopenharmony_ci 274662306a36Sopenharmony_ci fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; 274762306a36Sopenharmony_ci if (fail_tout && time_after(jiffies, fail_tout)) 274862306a36Sopenharmony_ci mptcp_mp_fail_no_response(msk); 274962306a36Sopenharmony_ci 275062306a36Sopenharmony_ciunlock: 275162306a36Sopenharmony_ci release_sock(sk); 275262306a36Sopenharmony_ci sock_put(sk); 275362306a36Sopenharmony_ci} 275462306a36Sopenharmony_ci 275562306a36Sopenharmony_cistatic void __mptcp_init_sock(struct sock *sk) 275662306a36Sopenharmony_ci{ 275762306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 275862306a36Sopenharmony_ci 275962306a36Sopenharmony_ci INIT_LIST_HEAD(&msk->conn_list); 276062306a36Sopenharmony_ci INIT_LIST_HEAD(&msk->join_list); 276162306a36Sopenharmony_ci INIT_LIST_HEAD(&msk->rtx_queue); 276262306a36Sopenharmony_ci INIT_WORK(&msk->work, mptcp_worker); 276362306a36Sopenharmony_ci __skb_queue_head_init(&msk->receive_queue); 276462306a36Sopenharmony_ci msk->out_of_order_queue = RB_ROOT; 276562306a36Sopenharmony_ci msk->first_pending = NULL; 276662306a36Sopenharmony_ci msk->rmem_fwd_alloc = 0; 276762306a36Sopenharmony_ci WRITE_ONCE(msk->rmem_released, 0); 276862306a36Sopenharmony_ci msk->timer_ival = TCP_RTO_MIN; 276962306a36Sopenharmony_ci 277062306a36Sopenharmony_ci WRITE_ONCE(msk->first, NULL); 277162306a36Sopenharmony_ci inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 277262306a36Sopenharmony_ci WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 277362306a36Sopenharmony_ci WRITE_ONCE(msk->allow_infinite_fallback, true); 277462306a36Sopenharmony_ci msk->recovery = false; 
277562306a36Sopenharmony_ci msk->subflow_id = 1; 277662306a36Sopenharmony_ci 277762306a36Sopenharmony_ci mptcp_pm_data_init(msk); 277862306a36Sopenharmony_ci 277962306a36Sopenharmony_ci /* re-use the csk retrans timer for MPTCP-level retrans */ 278062306a36Sopenharmony_ci timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 278162306a36Sopenharmony_ci timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); 278262306a36Sopenharmony_ci} 278362306a36Sopenharmony_ci 278462306a36Sopenharmony_cistatic void mptcp_ca_reset(struct sock *sk) 278562306a36Sopenharmony_ci{ 278662306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 278762306a36Sopenharmony_ci 278862306a36Sopenharmony_ci tcp_assign_congestion_control(sk); 278962306a36Sopenharmony_ci strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); 279062306a36Sopenharmony_ci 279162306a36Sopenharmony_ci /* no need to keep a reference to the ops, the name will suffice */ 279262306a36Sopenharmony_ci tcp_cleanup_congestion_control(sk); 279362306a36Sopenharmony_ci icsk->icsk_ca_ops = NULL; 279462306a36Sopenharmony_ci} 279562306a36Sopenharmony_ci 279662306a36Sopenharmony_cistatic int mptcp_init_sock(struct sock *sk) 279762306a36Sopenharmony_ci{ 279862306a36Sopenharmony_ci struct net *net = sock_net(sk); 279962306a36Sopenharmony_ci int ret; 280062306a36Sopenharmony_ci 280162306a36Sopenharmony_ci __mptcp_init_sock(sk); 280262306a36Sopenharmony_ci 280362306a36Sopenharmony_ci if (!mptcp_is_enabled(net)) 280462306a36Sopenharmony_ci return -ENOPROTOOPT; 280562306a36Sopenharmony_ci 280662306a36Sopenharmony_ci if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 280762306a36Sopenharmony_ci return -ENOMEM; 280862306a36Sopenharmony_ci 280962306a36Sopenharmony_ci ret = mptcp_init_sched(mptcp_sk(sk), 281062306a36Sopenharmony_ci mptcp_sched_find(mptcp_get_scheduler(net))); 281162306a36Sopenharmony_ci if (ret) 281262306a36Sopenharmony_ci return ret; 281362306a36Sopenharmony_ci 
281462306a36Sopenharmony_ci set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 281562306a36Sopenharmony_ci 281662306a36Sopenharmony_ci /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will 281762306a36Sopenharmony_ci * propagate the correct value 281862306a36Sopenharmony_ci */ 281962306a36Sopenharmony_ci mptcp_ca_reset(sk); 282062306a36Sopenharmony_ci 282162306a36Sopenharmony_ci sk_sockets_allocated_inc(sk); 282262306a36Sopenharmony_ci sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); 282362306a36Sopenharmony_ci sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); 282462306a36Sopenharmony_ci 282562306a36Sopenharmony_ci return 0; 282662306a36Sopenharmony_ci} 282762306a36Sopenharmony_ci 282862306a36Sopenharmony_cistatic void __mptcp_clear_xmit(struct sock *sk) 282962306a36Sopenharmony_ci{ 283062306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 283162306a36Sopenharmony_ci struct mptcp_data_frag *dtmp, *dfrag; 283262306a36Sopenharmony_ci 283362306a36Sopenharmony_ci WRITE_ONCE(msk->first_pending, NULL); 283462306a36Sopenharmony_ci list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 283562306a36Sopenharmony_ci dfrag_clear(sk, dfrag); 283662306a36Sopenharmony_ci} 283762306a36Sopenharmony_ci 283862306a36Sopenharmony_civoid mptcp_cancel_work(struct sock *sk) 283962306a36Sopenharmony_ci{ 284062306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 284162306a36Sopenharmony_ci 284262306a36Sopenharmony_ci if (cancel_work_sync(&msk->work)) 284362306a36Sopenharmony_ci __sock_put(sk); 284462306a36Sopenharmony_ci} 284562306a36Sopenharmony_ci 284662306a36Sopenharmony_civoid mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) 284762306a36Sopenharmony_ci{ 284862306a36Sopenharmony_ci lock_sock(ssk); 284962306a36Sopenharmony_ci 285062306a36Sopenharmony_ci switch (ssk->sk_state) { 285162306a36Sopenharmony_ci case TCP_LISTEN: 285262306a36Sopenharmony_ci if (!(how & RCV_SHUTDOWN)) 285362306a36Sopenharmony_ci break; 
285462306a36Sopenharmony_ci fallthrough; 285562306a36Sopenharmony_ci case TCP_SYN_SENT: 285662306a36Sopenharmony_ci WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); 285762306a36Sopenharmony_ci break; 285862306a36Sopenharmony_ci default: 285962306a36Sopenharmony_ci if (__mptcp_check_fallback(mptcp_sk(sk))) { 286062306a36Sopenharmony_ci pr_debug("Fallback"); 286162306a36Sopenharmony_ci ssk->sk_shutdown |= how; 286262306a36Sopenharmony_ci tcp_shutdown(ssk, how); 286362306a36Sopenharmony_ci 286462306a36Sopenharmony_ci /* simulate the data_fin ack reception to let the state 286562306a36Sopenharmony_ci * machine move forward 286662306a36Sopenharmony_ci */ 286762306a36Sopenharmony_ci WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); 286862306a36Sopenharmony_ci mptcp_schedule_work(sk); 286962306a36Sopenharmony_ci } else { 287062306a36Sopenharmony_ci pr_debug("Sending DATA_FIN on subflow %p", ssk); 287162306a36Sopenharmony_ci tcp_send_ack(ssk); 287262306a36Sopenharmony_ci if (!mptcp_rtx_timer_pending(sk)) 287362306a36Sopenharmony_ci mptcp_reset_rtx_timer(sk); 287462306a36Sopenharmony_ci } 287562306a36Sopenharmony_ci break; 287662306a36Sopenharmony_ci } 287762306a36Sopenharmony_ci 287862306a36Sopenharmony_ci release_sock(ssk); 287962306a36Sopenharmony_ci} 288062306a36Sopenharmony_ci 288162306a36Sopenharmony_civoid mptcp_set_state(struct sock *sk, int state) 288262306a36Sopenharmony_ci{ 288362306a36Sopenharmony_ci int oldstate = sk->sk_state; 288462306a36Sopenharmony_ci 288562306a36Sopenharmony_ci switch (state) { 288662306a36Sopenharmony_ci case TCP_ESTABLISHED: 288762306a36Sopenharmony_ci if (oldstate != TCP_ESTABLISHED) 288862306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); 288962306a36Sopenharmony_ci break; 289062306a36Sopenharmony_ci 289162306a36Sopenharmony_ci default: 289262306a36Sopenharmony_ci if (oldstate == TCP_ESTABLISHED) 289362306a36Sopenharmony_ci MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); 289462306a36Sopenharmony_ci } 
289562306a36Sopenharmony_ci 289662306a36Sopenharmony_ci inet_sk_state_store(sk, state); 289762306a36Sopenharmony_ci} 289862306a36Sopenharmony_ci 289962306a36Sopenharmony_cistatic const unsigned char new_state[16] = { 290062306a36Sopenharmony_ci /* current state: new state: action: */ 290162306a36Sopenharmony_ci [0 /* (Invalid) */] = TCP_CLOSE, 290262306a36Sopenharmony_ci [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 290362306a36Sopenharmony_ci [TCP_SYN_SENT] = TCP_CLOSE, 290462306a36Sopenharmony_ci [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 290562306a36Sopenharmony_ci [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, 290662306a36Sopenharmony_ci [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, 290762306a36Sopenharmony_ci [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ 290862306a36Sopenharmony_ci [TCP_CLOSE] = TCP_CLOSE, 290962306a36Sopenharmony_ci [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, 291062306a36Sopenharmony_ci [TCP_LAST_ACK] = TCP_LAST_ACK, 291162306a36Sopenharmony_ci [TCP_LISTEN] = TCP_CLOSE, 291262306a36Sopenharmony_ci [TCP_CLOSING] = TCP_CLOSING, 291362306a36Sopenharmony_ci [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! 
*/ 291462306a36Sopenharmony_ci}; 291562306a36Sopenharmony_ci 291662306a36Sopenharmony_cistatic int mptcp_close_state(struct sock *sk) 291762306a36Sopenharmony_ci{ 291862306a36Sopenharmony_ci int next = (int)new_state[sk->sk_state]; 291962306a36Sopenharmony_ci int ns = next & TCP_STATE_MASK; 292062306a36Sopenharmony_ci 292162306a36Sopenharmony_ci mptcp_set_state(sk, ns); 292262306a36Sopenharmony_ci 292362306a36Sopenharmony_ci return next & TCP_ACTION_FIN; 292462306a36Sopenharmony_ci} 292562306a36Sopenharmony_ci 292662306a36Sopenharmony_cistatic void mptcp_check_send_data_fin(struct sock *sk) 292762306a36Sopenharmony_ci{ 292862306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 292962306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 293062306a36Sopenharmony_ci 293162306a36Sopenharmony_ci pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", 293262306a36Sopenharmony_ci msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), 293362306a36Sopenharmony_ci msk->snd_nxt, msk->write_seq); 293462306a36Sopenharmony_ci 293562306a36Sopenharmony_ci /* we still need to enqueue subflows or not really shutting down, 293662306a36Sopenharmony_ci * skip this 293762306a36Sopenharmony_ci */ 293862306a36Sopenharmony_ci if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || 293962306a36Sopenharmony_ci mptcp_send_head(sk)) 294062306a36Sopenharmony_ci return; 294162306a36Sopenharmony_ci 294262306a36Sopenharmony_ci WRITE_ONCE(msk->snd_nxt, msk->write_seq); 294362306a36Sopenharmony_ci 294462306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 294562306a36Sopenharmony_ci struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 294662306a36Sopenharmony_ci 294762306a36Sopenharmony_ci mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); 294862306a36Sopenharmony_ci } 294962306a36Sopenharmony_ci} 295062306a36Sopenharmony_ci 295162306a36Sopenharmony_cistatic void __mptcp_wr_shutdown(struct sock *sk) 295262306a36Sopenharmony_ci{ 
295362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 295462306a36Sopenharmony_ci 295562306a36Sopenharmony_ci pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", 295662306a36Sopenharmony_ci msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, 295762306a36Sopenharmony_ci !!mptcp_send_head(sk)); 295862306a36Sopenharmony_ci 295962306a36Sopenharmony_ci /* will be ignored by fallback sockets */ 296062306a36Sopenharmony_ci WRITE_ONCE(msk->write_seq, msk->write_seq + 1); 296162306a36Sopenharmony_ci WRITE_ONCE(msk->snd_data_fin_enable, 1); 296262306a36Sopenharmony_ci 296362306a36Sopenharmony_ci mptcp_check_send_data_fin(sk); 296462306a36Sopenharmony_ci} 296562306a36Sopenharmony_ci 296662306a36Sopenharmony_cistatic void __mptcp_destroy_sock(struct sock *sk) 296762306a36Sopenharmony_ci{ 296862306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 296962306a36Sopenharmony_ci 297062306a36Sopenharmony_ci pr_debug("msk=%p", msk); 297162306a36Sopenharmony_ci 297262306a36Sopenharmony_ci might_sleep(); 297362306a36Sopenharmony_ci 297462306a36Sopenharmony_ci mptcp_stop_rtx_timer(sk); 297562306a36Sopenharmony_ci sk_stop_timer(sk, &sk->sk_timer); 297662306a36Sopenharmony_ci msk->pm.status = 0; 297762306a36Sopenharmony_ci mptcp_release_sched(msk); 297862306a36Sopenharmony_ci 297962306a36Sopenharmony_ci sk->sk_prot->destroy(sk); 298062306a36Sopenharmony_ci 298162306a36Sopenharmony_ci WARN_ON_ONCE(msk->rmem_fwd_alloc); 298262306a36Sopenharmony_ci WARN_ON_ONCE(msk->rmem_released); 298362306a36Sopenharmony_ci sk_stream_kill_queues(sk); 298462306a36Sopenharmony_ci xfrm_sk_free_policy(sk); 298562306a36Sopenharmony_ci 298662306a36Sopenharmony_ci sock_put(sk); 298762306a36Sopenharmony_ci} 298862306a36Sopenharmony_ci 298962306a36Sopenharmony_civoid __mptcp_unaccepted_force_close(struct sock *sk) 299062306a36Sopenharmony_ci{ 299162306a36Sopenharmony_ci sock_set_flag(sk, SOCK_DEAD); 299262306a36Sopenharmony_ci mptcp_do_fastclose(sk); 
299362306a36Sopenharmony_ci __mptcp_destroy_sock(sk); 299462306a36Sopenharmony_ci} 299562306a36Sopenharmony_ci 299662306a36Sopenharmony_cistatic __poll_t mptcp_check_readable(struct mptcp_sock *msk) 299762306a36Sopenharmony_ci{ 299862306a36Sopenharmony_ci /* Concurrent splices from sk_receive_queue into receive_queue will 299962306a36Sopenharmony_ci * always show at least one non-empty queue when checked in this order. 300062306a36Sopenharmony_ci */ 300162306a36Sopenharmony_ci if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && 300262306a36Sopenharmony_ci skb_queue_empty_lockless(&msk->receive_queue)) 300362306a36Sopenharmony_ci return 0; 300462306a36Sopenharmony_ci 300562306a36Sopenharmony_ci return EPOLLIN | EPOLLRDNORM; 300662306a36Sopenharmony_ci} 300762306a36Sopenharmony_ci 300862306a36Sopenharmony_cistatic void mptcp_check_listen_stop(struct sock *sk) 300962306a36Sopenharmony_ci{ 301062306a36Sopenharmony_ci struct sock *ssk; 301162306a36Sopenharmony_ci 301262306a36Sopenharmony_ci if (inet_sk_state_load(sk) != TCP_LISTEN) 301362306a36Sopenharmony_ci return; 301462306a36Sopenharmony_ci 301562306a36Sopenharmony_ci sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 301662306a36Sopenharmony_ci ssk = mptcp_sk(sk)->first; 301762306a36Sopenharmony_ci if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) 301862306a36Sopenharmony_ci return; 301962306a36Sopenharmony_ci 302062306a36Sopenharmony_ci lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 302162306a36Sopenharmony_ci tcp_set_state(ssk, TCP_CLOSE); 302262306a36Sopenharmony_ci mptcp_subflow_queue_clean(sk, ssk); 302362306a36Sopenharmony_ci inet_csk_listen_stop(ssk); 302462306a36Sopenharmony_ci mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); 302562306a36Sopenharmony_ci release_sock(ssk); 302662306a36Sopenharmony_ci} 302762306a36Sopenharmony_ci 302862306a36Sopenharmony_cibool __mptcp_close(struct sock *sk, long timeout) 302962306a36Sopenharmony_ci{ 303062306a36Sopenharmony_ci 
struct mptcp_subflow_context *subflow; 303162306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 303262306a36Sopenharmony_ci bool do_cancel_work = false; 303362306a36Sopenharmony_ci int subflows_alive = 0; 303462306a36Sopenharmony_ci 303562306a36Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 303662306a36Sopenharmony_ci 303762306a36Sopenharmony_ci if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { 303862306a36Sopenharmony_ci mptcp_check_listen_stop(sk); 303962306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 304062306a36Sopenharmony_ci goto cleanup; 304162306a36Sopenharmony_ci } 304262306a36Sopenharmony_ci 304362306a36Sopenharmony_ci if (mptcp_check_readable(msk) || timeout < 0) { 304462306a36Sopenharmony_ci /* If the msk has read data, or the caller explicitly ask it, 304562306a36Sopenharmony_ci * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose 304662306a36Sopenharmony_ci */ 304762306a36Sopenharmony_ci mptcp_do_fastclose(sk); 304862306a36Sopenharmony_ci timeout = 0; 304962306a36Sopenharmony_ci } else if (mptcp_close_state(sk)) { 305062306a36Sopenharmony_ci __mptcp_wr_shutdown(sk); 305162306a36Sopenharmony_ci } 305262306a36Sopenharmony_ci 305362306a36Sopenharmony_ci sk_stream_wait_close(sk, timeout); 305462306a36Sopenharmony_ci 305562306a36Sopenharmony_cicleanup: 305662306a36Sopenharmony_ci /* orphan all the subflows */ 305762306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 305862306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 305962306a36Sopenharmony_ci bool slow = lock_sock_fast_nested(ssk); 306062306a36Sopenharmony_ci 306162306a36Sopenharmony_ci subflows_alive += ssk->sk_state != TCP_CLOSE; 306262306a36Sopenharmony_ci 306362306a36Sopenharmony_ci /* since the close timeout takes precedence on the fail one, 306462306a36Sopenharmony_ci * cancel the latter 306562306a36Sopenharmony_ci */ 306662306a36Sopenharmony_ci if (ssk == msk->first) 306762306a36Sopenharmony_ci subflow->fail_tout = 0; 
306862306a36Sopenharmony_ci 306962306a36Sopenharmony_ci /* detach from the parent socket, but allow data_ready to 307062306a36Sopenharmony_ci * push incoming data into the mptcp stack, to properly ack it 307162306a36Sopenharmony_ci */ 307262306a36Sopenharmony_ci ssk->sk_socket = NULL; 307362306a36Sopenharmony_ci ssk->sk_wq = NULL; 307462306a36Sopenharmony_ci unlock_sock_fast(ssk, slow); 307562306a36Sopenharmony_ci } 307662306a36Sopenharmony_ci sock_orphan(sk); 307762306a36Sopenharmony_ci 307862306a36Sopenharmony_ci /* all the subflows are closed, only timeout can change the msk 307962306a36Sopenharmony_ci * state, let's not keep resources busy for no reasons 308062306a36Sopenharmony_ci */ 308162306a36Sopenharmony_ci if (subflows_alive == 0) 308262306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 308362306a36Sopenharmony_ci 308462306a36Sopenharmony_ci sock_hold(sk); 308562306a36Sopenharmony_ci pr_debug("msk=%p state=%d", sk, sk->sk_state); 308662306a36Sopenharmony_ci if (msk->token) 308762306a36Sopenharmony_ci mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); 308862306a36Sopenharmony_ci 308962306a36Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) { 309062306a36Sopenharmony_ci __mptcp_destroy_sock(sk); 309162306a36Sopenharmony_ci do_cancel_work = true; 309262306a36Sopenharmony_ci } else { 309362306a36Sopenharmony_ci mptcp_start_tout_timer(sk); 309462306a36Sopenharmony_ci } 309562306a36Sopenharmony_ci 309662306a36Sopenharmony_ci return do_cancel_work; 309762306a36Sopenharmony_ci} 309862306a36Sopenharmony_ci 309962306a36Sopenharmony_cistatic void mptcp_close(struct sock *sk, long timeout) 310062306a36Sopenharmony_ci{ 310162306a36Sopenharmony_ci bool do_cancel_work; 310262306a36Sopenharmony_ci 310362306a36Sopenharmony_ci lock_sock(sk); 310462306a36Sopenharmony_ci 310562306a36Sopenharmony_ci do_cancel_work = __mptcp_close(sk, timeout); 310662306a36Sopenharmony_ci release_sock(sk); 310762306a36Sopenharmony_ci if (do_cancel_work) 310862306a36Sopenharmony_ci 
mptcp_cancel_work(sk); 310962306a36Sopenharmony_ci 311062306a36Sopenharmony_ci sock_put(sk); 311162306a36Sopenharmony_ci} 311262306a36Sopenharmony_ci 311362306a36Sopenharmony_cistatic void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) 311462306a36Sopenharmony_ci{ 311562306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 311662306a36Sopenharmony_ci const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); 311762306a36Sopenharmony_ci struct ipv6_pinfo *msk6 = inet6_sk(msk); 311862306a36Sopenharmony_ci 311962306a36Sopenharmony_ci msk->sk_v6_daddr = ssk->sk_v6_daddr; 312062306a36Sopenharmony_ci msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; 312162306a36Sopenharmony_ci 312262306a36Sopenharmony_ci if (msk6 && ssk6) { 312362306a36Sopenharmony_ci msk6->saddr = ssk6->saddr; 312462306a36Sopenharmony_ci msk6->flow_label = ssk6->flow_label; 312562306a36Sopenharmony_ci } 312662306a36Sopenharmony_ci#endif 312762306a36Sopenharmony_ci 312862306a36Sopenharmony_ci inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; 312962306a36Sopenharmony_ci inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; 313062306a36Sopenharmony_ci inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; 313162306a36Sopenharmony_ci inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; 313262306a36Sopenharmony_ci inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; 313362306a36Sopenharmony_ci inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; 313462306a36Sopenharmony_ci} 313562306a36Sopenharmony_ci 313662306a36Sopenharmony_cistatic int mptcp_disconnect(struct sock *sk, int flags) 313762306a36Sopenharmony_ci{ 313862306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 313962306a36Sopenharmony_ci 314062306a36Sopenharmony_ci /* We are on the fastopen error path. We can't call straight into the 314162306a36Sopenharmony_ci * subflows cleanup code due to lock nesting (we are already under 314262306a36Sopenharmony_ci * msk->firstsocket lock). 
314362306a36Sopenharmony_ci */ 314462306a36Sopenharmony_ci if (msk->fastopening) 314562306a36Sopenharmony_ci return -EBUSY; 314662306a36Sopenharmony_ci 314762306a36Sopenharmony_ci mptcp_check_listen_stop(sk); 314862306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 314962306a36Sopenharmony_ci 315062306a36Sopenharmony_ci mptcp_stop_rtx_timer(sk); 315162306a36Sopenharmony_ci mptcp_stop_tout_timer(sk); 315262306a36Sopenharmony_ci 315362306a36Sopenharmony_ci if (msk->token) 315462306a36Sopenharmony_ci mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); 315562306a36Sopenharmony_ci 315662306a36Sopenharmony_ci /* msk->subflow is still intact, the following will not free the first 315762306a36Sopenharmony_ci * subflow 315862306a36Sopenharmony_ci */ 315962306a36Sopenharmony_ci mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); 316062306a36Sopenharmony_ci WRITE_ONCE(msk->flags, 0); 316162306a36Sopenharmony_ci msk->cb_flags = 0; 316262306a36Sopenharmony_ci msk->recovery = false; 316362306a36Sopenharmony_ci msk->can_ack = false; 316462306a36Sopenharmony_ci msk->fully_established = false; 316562306a36Sopenharmony_ci msk->rcv_data_fin = false; 316662306a36Sopenharmony_ci msk->snd_data_fin_enable = false; 316762306a36Sopenharmony_ci msk->rcv_fastclose = false; 316862306a36Sopenharmony_ci msk->use_64bit_ack = false; 316962306a36Sopenharmony_ci WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 317062306a36Sopenharmony_ci mptcp_pm_data_reset(msk); 317162306a36Sopenharmony_ci mptcp_ca_reset(sk); 317262306a36Sopenharmony_ci msk->bytes_acked = 0; 317362306a36Sopenharmony_ci msk->bytes_received = 0; 317462306a36Sopenharmony_ci msk->bytes_sent = 0; 317562306a36Sopenharmony_ci msk->bytes_retrans = 0; 317662306a36Sopenharmony_ci msk->rcvspace_init = 0; 317762306a36Sopenharmony_ci 317862306a36Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, 0); 317962306a36Sopenharmony_ci sk_error_report(sk); 318062306a36Sopenharmony_ci return 0; 318162306a36Sopenharmony_ci} 
318262306a36Sopenharmony_ci 318362306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 318462306a36Sopenharmony_cistatic struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) 318562306a36Sopenharmony_ci{ 318662306a36Sopenharmony_ci unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); 318762306a36Sopenharmony_ci 318862306a36Sopenharmony_ci return (struct ipv6_pinfo *)(((u8 *)sk) + offset); 318962306a36Sopenharmony_ci} 319062306a36Sopenharmony_ci 319162306a36Sopenharmony_cistatic void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) 319262306a36Sopenharmony_ci{ 319362306a36Sopenharmony_ci const struct ipv6_pinfo *np = inet6_sk(sk); 319462306a36Sopenharmony_ci struct ipv6_txoptions *opt; 319562306a36Sopenharmony_ci struct ipv6_pinfo *newnp; 319662306a36Sopenharmony_ci 319762306a36Sopenharmony_ci newnp = inet6_sk(newsk); 319862306a36Sopenharmony_ci 319962306a36Sopenharmony_ci rcu_read_lock(); 320062306a36Sopenharmony_ci opt = rcu_dereference(np->opt); 320162306a36Sopenharmony_ci if (opt) { 320262306a36Sopenharmony_ci opt = ipv6_dup_options(newsk, opt); 320362306a36Sopenharmony_ci if (!opt) 320462306a36Sopenharmony_ci net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); 320562306a36Sopenharmony_ci } 320662306a36Sopenharmony_ci RCU_INIT_POINTER(newnp->opt, opt); 320762306a36Sopenharmony_ci rcu_read_unlock(); 320862306a36Sopenharmony_ci} 320962306a36Sopenharmony_ci#endif 321062306a36Sopenharmony_ci 321162306a36Sopenharmony_cistatic void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) 321262306a36Sopenharmony_ci{ 321362306a36Sopenharmony_ci struct ip_options_rcu *inet_opt, *newopt = NULL; 321462306a36Sopenharmony_ci const struct inet_sock *inet = inet_sk(sk); 321562306a36Sopenharmony_ci struct inet_sock *newinet; 321662306a36Sopenharmony_ci 321762306a36Sopenharmony_ci newinet = inet_sk(newsk); 321862306a36Sopenharmony_ci 321962306a36Sopenharmony_ci rcu_read_lock(); 322062306a36Sopenharmony_ci 
inet_opt = rcu_dereference(inet->inet_opt); 322162306a36Sopenharmony_ci if (inet_opt) { 322262306a36Sopenharmony_ci newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + 322362306a36Sopenharmony_ci inet_opt->opt.optlen, GFP_ATOMIC); 322462306a36Sopenharmony_ci if (newopt) 322562306a36Sopenharmony_ci memcpy(newopt, inet_opt, sizeof(*inet_opt) + 322662306a36Sopenharmony_ci inet_opt->opt.optlen); 322762306a36Sopenharmony_ci else 322862306a36Sopenharmony_ci net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); 322962306a36Sopenharmony_ci } 323062306a36Sopenharmony_ci RCU_INIT_POINTER(newinet->inet_opt, newopt); 323162306a36Sopenharmony_ci rcu_read_unlock(); 323262306a36Sopenharmony_ci} 323362306a36Sopenharmony_ci 323462306a36Sopenharmony_cistruct sock *mptcp_sk_clone_init(const struct sock *sk, 323562306a36Sopenharmony_ci const struct mptcp_options_received *mp_opt, 323662306a36Sopenharmony_ci struct sock *ssk, 323762306a36Sopenharmony_ci struct request_sock *req) 323862306a36Sopenharmony_ci{ 323962306a36Sopenharmony_ci struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 324062306a36Sopenharmony_ci struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); 324162306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 324262306a36Sopenharmony_ci struct mptcp_sock *msk; 324362306a36Sopenharmony_ci 324462306a36Sopenharmony_ci if (!nsk) 324562306a36Sopenharmony_ci return NULL; 324662306a36Sopenharmony_ci 324762306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 324862306a36Sopenharmony_ci if (nsk->sk_family == AF_INET6) 324962306a36Sopenharmony_ci inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); 325062306a36Sopenharmony_ci#endif 325162306a36Sopenharmony_ci 325262306a36Sopenharmony_ci __mptcp_init_sock(nsk); 325362306a36Sopenharmony_ci 325462306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 325562306a36Sopenharmony_ci if (nsk->sk_family == AF_INET6) 325662306a36Sopenharmony_ci mptcp_copy_ip6_options(nsk, sk); 325762306a36Sopenharmony_ci else 
325862306a36Sopenharmony_ci#endif 325962306a36Sopenharmony_ci mptcp_copy_ip_options(nsk, sk); 326062306a36Sopenharmony_ci 326162306a36Sopenharmony_ci msk = mptcp_sk(nsk); 326262306a36Sopenharmony_ci msk->local_key = subflow_req->local_key; 326362306a36Sopenharmony_ci msk->token = subflow_req->token; 326462306a36Sopenharmony_ci msk->in_accept_queue = 1; 326562306a36Sopenharmony_ci WRITE_ONCE(msk->fully_established, false); 326662306a36Sopenharmony_ci if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) 326762306a36Sopenharmony_ci WRITE_ONCE(msk->csum_enabled, true); 326862306a36Sopenharmony_ci 326962306a36Sopenharmony_ci msk->write_seq = subflow_req->idsn + 1; 327062306a36Sopenharmony_ci msk->snd_nxt = msk->write_seq; 327162306a36Sopenharmony_ci msk->snd_una = msk->write_seq; 327262306a36Sopenharmony_ci msk->wnd_end = msk->snd_nxt + tcp_sk(ssk)->snd_wnd; 327362306a36Sopenharmony_ci msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; 327462306a36Sopenharmony_ci mptcp_init_sched(msk, mptcp_sk(sk)->sched); 327562306a36Sopenharmony_ci 327662306a36Sopenharmony_ci /* passive msk is created after the first/MPC subflow */ 327762306a36Sopenharmony_ci msk->subflow_id = 2; 327862306a36Sopenharmony_ci 327962306a36Sopenharmony_ci sock_reset_flag(nsk, SOCK_RCU_FREE); 328062306a36Sopenharmony_ci security_inet_csk_clone(nsk, req); 328162306a36Sopenharmony_ci 328262306a36Sopenharmony_ci /* this can't race with mptcp_close(), as the msk is 328362306a36Sopenharmony_ci * not yet exposted to user-space 328462306a36Sopenharmony_ci */ 328562306a36Sopenharmony_ci mptcp_set_state(nsk, TCP_ESTABLISHED); 328662306a36Sopenharmony_ci 328762306a36Sopenharmony_ci /* The msk maintain a ref to each subflow in the connections list */ 328862306a36Sopenharmony_ci WRITE_ONCE(msk->first, ssk); 328962306a36Sopenharmony_ci subflow = mptcp_subflow_ctx(ssk); 329062306a36Sopenharmony_ci list_add(&subflow->node, &msk->conn_list); 329162306a36Sopenharmony_ci sock_hold(ssk); 329262306a36Sopenharmony_ci 
329362306a36Sopenharmony_ci /* new mpc subflow takes ownership of the newly 329462306a36Sopenharmony_ci * created mptcp socket 329562306a36Sopenharmony_ci */ 329662306a36Sopenharmony_ci mptcp_token_accept(subflow_req, msk); 329762306a36Sopenharmony_ci 329862306a36Sopenharmony_ci /* set msk addresses early to ensure mptcp_pm_get_local_id() 329962306a36Sopenharmony_ci * uses the correct data 330062306a36Sopenharmony_ci */ 330162306a36Sopenharmony_ci mptcp_copy_inaddrs(nsk, ssk); 330262306a36Sopenharmony_ci __mptcp_propagate_sndbuf(nsk, ssk); 330362306a36Sopenharmony_ci 330462306a36Sopenharmony_ci mptcp_rcv_space_init(msk, ssk); 330562306a36Sopenharmony_ci 330662306a36Sopenharmony_ci if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) 330762306a36Sopenharmony_ci __mptcp_subflow_fully_established(msk, subflow, mp_opt); 330862306a36Sopenharmony_ci bh_unlock_sock(nsk); 330962306a36Sopenharmony_ci 331062306a36Sopenharmony_ci /* note: the newly allocated socket refcount is 2 now */ 331162306a36Sopenharmony_ci return nsk; 331262306a36Sopenharmony_ci} 331362306a36Sopenharmony_ci 331462306a36Sopenharmony_civoid mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) 331562306a36Sopenharmony_ci{ 331662306a36Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(ssk); 331762306a36Sopenharmony_ci 331862306a36Sopenharmony_ci msk->rcvspace_init = 1; 331962306a36Sopenharmony_ci msk->rcvq_space.copied = 0; 332062306a36Sopenharmony_ci msk->rcvq_space.rtt_us = 0; 332162306a36Sopenharmony_ci 332262306a36Sopenharmony_ci msk->rcvq_space.time = tp->tcp_mstamp; 332362306a36Sopenharmony_ci 332462306a36Sopenharmony_ci /* initial rcv_space offering made to peer */ 332562306a36Sopenharmony_ci msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, 332662306a36Sopenharmony_ci TCP_INIT_CWND * tp->advmss); 332762306a36Sopenharmony_ci if (msk->rcvq_space.space == 0) 332862306a36Sopenharmony_ci msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; 332962306a36Sopenharmony_ci} 
333062306a36Sopenharmony_ci 333162306a36Sopenharmony_cistatic struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, 333262306a36Sopenharmony_ci bool kern) 333362306a36Sopenharmony_ci{ 333462306a36Sopenharmony_ci struct sock *newsk; 333562306a36Sopenharmony_ci 333662306a36Sopenharmony_ci pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); 333762306a36Sopenharmony_ci newsk = inet_csk_accept(ssk, flags, err, kern); 333862306a36Sopenharmony_ci if (!newsk) 333962306a36Sopenharmony_ci return NULL; 334062306a36Sopenharmony_ci 334162306a36Sopenharmony_ci pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); 334262306a36Sopenharmony_ci if (sk_is_mptcp(newsk)) { 334362306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 334462306a36Sopenharmony_ci struct sock *new_mptcp_sock; 334562306a36Sopenharmony_ci 334662306a36Sopenharmony_ci subflow = mptcp_subflow_ctx(newsk); 334762306a36Sopenharmony_ci new_mptcp_sock = subflow->conn; 334862306a36Sopenharmony_ci 334962306a36Sopenharmony_ci /* is_mptcp should be false if subflow->conn is missing, see 335062306a36Sopenharmony_ci * subflow_syn_recv_sock() 335162306a36Sopenharmony_ci */ 335262306a36Sopenharmony_ci if (WARN_ON_ONCE(!new_mptcp_sock)) { 335362306a36Sopenharmony_ci tcp_sk(newsk)->is_mptcp = 0; 335462306a36Sopenharmony_ci goto out; 335562306a36Sopenharmony_ci } 335662306a36Sopenharmony_ci 335762306a36Sopenharmony_ci newsk = new_mptcp_sock; 335862306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); 335962306a36Sopenharmony_ci } else { 336062306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(ssk), 336162306a36Sopenharmony_ci MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); 336262306a36Sopenharmony_ci } 336362306a36Sopenharmony_ci 336462306a36Sopenharmony_ciout: 336562306a36Sopenharmony_ci newsk->sk_kern_sock = kern; 336662306a36Sopenharmony_ci return newsk; 336762306a36Sopenharmony_ci} 336862306a36Sopenharmony_ci 336962306a36Sopenharmony_civoid 
mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) 337062306a36Sopenharmony_ci{ 337162306a36Sopenharmony_ci struct mptcp_subflow_context *subflow, *tmp; 337262306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 337362306a36Sopenharmony_ci 337462306a36Sopenharmony_ci __mptcp_clear_xmit(sk); 337562306a36Sopenharmony_ci 337662306a36Sopenharmony_ci /* join list will be eventually flushed (with rst) at sock lock release time */ 337762306a36Sopenharmony_ci mptcp_for_each_subflow_safe(msk, subflow, tmp) 337862306a36Sopenharmony_ci __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); 337962306a36Sopenharmony_ci 338062306a36Sopenharmony_ci /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ 338162306a36Sopenharmony_ci mptcp_data_lock(sk); 338262306a36Sopenharmony_ci skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); 338362306a36Sopenharmony_ci __skb_queue_purge(&sk->sk_receive_queue); 338462306a36Sopenharmony_ci skb_rbtree_purge(&msk->out_of_order_queue); 338562306a36Sopenharmony_ci mptcp_data_unlock(sk); 338662306a36Sopenharmony_ci 338762306a36Sopenharmony_ci /* move all the rx fwd alloc into the sk_mem_reclaim_final in 338862306a36Sopenharmony_ci * inet_sock_destruct() will dispose it 338962306a36Sopenharmony_ci */ 339062306a36Sopenharmony_ci sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); 339162306a36Sopenharmony_ci WRITE_ONCE(msk->rmem_fwd_alloc, 0); 339262306a36Sopenharmony_ci mptcp_token_destroy(msk); 339362306a36Sopenharmony_ci mptcp_pm_free_anno_list(msk); 339462306a36Sopenharmony_ci mptcp_free_local_addr_list(msk); 339562306a36Sopenharmony_ci} 339662306a36Sopenharmony_ci 339762306a36Sopenharmony_cistatic void mptcp_destroy(struct sock *sk) 339862306a36Sopenharmony_ci{ 339962306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 340062306a36Sopenharmony_ci 340162306a36Sopenharmony_ci /* allow the following to close even the initial subflow */ 340262306a36Sopenharmony_ci 
msk->free_first = 1; 340362306a36Sopenharmony_ci mptcp_destroy_common(msk, 0); 340462306a36Sopenharmony_ci sk_sockets_allocated_dec(sk); 340562306a36Sopenharmony_ci} 340662306a36Sopenharmony_ci 340762306a36Sopenharmony_civoid __mptcp_data_acked(struct sock *sk) 340862306a36Sopenharmony_ci{ 340962306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) 341062306a36Sopenharmony_ci __mptcp_clean_una(sk); 341162306a36Sopenharmony_ci else 341262306a36Sopenharmony_ci __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); 341362306a36Sopenharmony_ci 341462306a36Sopenharmony_ci if (mptcp_pending_data_fin_ack(sk)) 341562306a36Sopenharmony_ci mptcp_schedule_work(sk); 341662306a36Sopenharmony_ci} 341762306a36Sopenharmony_ci 341862306a36Sopenharmony_civoid __mptcp_check_push(struct sock *sk, struct sock *ssk) 341962306a36Sopenharmony_ci{ 342062306a36Sopenharmony_ci if (!mptcp_send_head(sk)) 342162306a36Sopenharmony_ci return; 342262306a36Sopenharmony_ci 342362306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) 342462306a36Sopenharmony_ci __mptcp_subflow_push_pending(sk, ssk, false); 342562306a36Sopenharmony_ci else 342662306a36Sopenharmony_ci __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); 342762306a36Sopenharmony_ci} 342862306a36Sopenharmony_ci 342962306a36Sopenharmony_ci#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ 343062306a36Sopenharmony_ci BIT(MPTCP_RETRANSMIT) | \ 343162306a36Sopenharmony_ci BIT(MPTCP_FLUSH_JOIN_LIST)) 343262306a36Sopenharmony_ci 343362306a36Sopenharmony_ci/* processes deferred events and flush wmem */ 343462306a36Sopenharmony_cistatic void mptcp_release_cb(struct sock *sk) 343562306a36Sopenharmony_ci __must_hold(&sk->sk_lock.slock) 343662306a36Sopenharmony_ci{ 343762306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 343862306a36Sopenharmony_ci 343962306a36Sopenharmony_ci for (;;) { 344062306a36Sopenharmony_ci unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); 344162306a36Sopenharmony_ci struct list_head 
join_list; 344262306a36Sopenharmony_ci 344362306a36Sopenharmony_ci if (!flags) 344462306a36Sopenharmony_ci break; 344562306a36Sopenharmony_ci 344662306a36Sopenharmony_ci INIT_LIST_HEAD(&join_list); 344762306a36Sopenharmony_ci list_splice_init(&msk->join_list, &join_list); 344862306a36Sopenharmony_ci 344962306a36Sopenharmony_ci /* the following actions acquire the subflow socket lock 345062306a36Sopenharmony_ci * 345162306a36Sopenharmony_ci * 1) can't be invoked in atomic scope 345262306a36Sopenharmony_ci * 2) must avoid ABBA deadlock with msk socket spinlock: the RX 345362306a36Sopenharmony_ci * datapath acquires the msk socket spinlock while helding 345462306a36Sopenharmony_ci * the subflow socket lock 345562306a36Sopenharmony_ci */ 345662306a36Sopenharmony_ci msk->cb_flags &= ~flags; 345762306a36Sopenharmony_ci spin_unlock_bh(&sk->sk_lock.slock); 345862306a36Sopenharmony_ci 345962306a36Sopenharmony_ci if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) 346062306a36Sopenharmony_ci __mptcp_flush_join_list(sk, &join_list); 346162306a36Sopenharmony_ci if (flags & BIT(MPTCP_PUSH_PENDING)) 346262306a36Sopenharmony_ci __mptcp_push_pending(sk, 0); 346362306a36Sopenharmony_ci if (flags & BIT(MPTCP_RETRANSMIT)) 346462306a36Sopenharmony_ci __mptcp_retrans(sk); 346562306a36Sopenharmony_ci 346662306a36Sopenharmony_ci cond_resched(); 346762306a36Sopenharmony_ci spin_lock_bh(&sk->sk_lock.slock); 346862306a36Sopenharmony_ci } 346962306a36Sopenharmony_ci 347062306a36Sopenharmony_ci if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) 347162306a36Sopenharmony_ci __mptcp_clean_una_wakeup(sk); 347262306a36Sopenharmony_ci if (unlikely(msk->cb_flags)) { 347362306a36Sopenharmony_ci /* be sure to sync the msk state before taking actions 347462306a36Sopenharmony_ci * depending on sk_state (MPTCP_ERROR_REPORT) 347562306a36Sopenharmony_ci * On sk release avoid actions depending on the first subflow 347662306a36Sopenharmony_ci */ 347762306a36Sopenharmony_ci if 
(__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) 347862306a36Sopenharmony_ci __mptcp_sync_state(sk, msk->pending_state); 347962306a36Sopenharmony_ci if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) 348062306a36Sopenharmony_ci __mptcp_error_report(sk); 348162306a36Sopenharmony_ci if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) 348262306a36Sopenharmony_ci __mptcp_sync_sndbuf(sk); 348362306a36Sopenharmony_ci } 348462306a36Sopenharmony_ci 348562306a36Sopenharmony_ci __mptcp_update_rmem(sk); 348662306a36Sopenharmony_ci} 348762306a36Sopenharmony_ci 348862306a36Sopenharmony_ci/* MP_JOIN client subflow must wait for 4th ack before sending any data: 348962306a36Sopenharmony_ci * TCP can't schedule delack timer before the subflow is fully established. 349062306a36Sopenharmony_ci * MPTCP uses the delack timer to do 3rd ack retransmissions 349162306a36Sopenharmony_ci */ 349262306a36Sopenharmony_cistatic void schedule_3rdack_retransmission(struct sock *ssk) 349362306a36Sopenharmony_ci{ 349462306a36Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(ssk); 349562306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(ssk); 349662306a36Sopenharmony_ci unsigned long timeout; 349762306a36Sopenharmony_ci 349862306a36Sopenharmony_ci if (mptcp_subflow_ctx(ssk)->fully_established) 349962306a36Sopenharmony_ci return; 350062306a36Sopenharmony_ci 350162306a36Sopenharmony_ci /* reschedule with a timeout above RTT, as we must look only for drop */ 350262306a36Sopenharmony_ci if (tp->srtt_us) 350362306a36Sopenharmony_ci timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); 350462306a36Sopenharmony_ci else 350562306a36Sopenharmony_ci timeout = TCP_TIMEOUT_INIT; 350662306a36Sopenharmony_ci timeout += jiffies; 350762306a36Sopenharmony_ci 350862306a36Sopenharmony_ci WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 350962306a36Sopenharmony_ci icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 351062306a36Sopenharmony_ci 
icsk->icsk_ack.timeout = timeout; 351162306a36Sopenharmony_ci sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); 351262306a36Sopenharmony_ci} 351362306a36Sopenharmony_ci 351462306a36Sopenharmony_civoid mptcp_subflow_process_delegated(struct sock *ssk, long status) 351562306a36Sopenharmony_ci{ 351662306a36Sopenharmony_ci struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 351762306a36Sopenharmony_ci struct sock *sk = subflow->conn; 351862306a36Sopenharmony_ci 351962306a36Sopenharmony_ci if (status & BIT(MPTCP_DELEGATE_SEND)) { 352062306a36Sopenharmony_ci mptcp_data_lock(sk); 352162306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) 352262306a36Sopenharmony_ci __mptcp_subflow_push_pending(sk, ssk, true); 352362306a36Sopenharmony_ci else 352462306a36Sopenharmony_ci __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); 352562306a36Sopenharmony_ci mptcp_data_unlock(sk); 352662306a36Sopenharmony_ci } 352762306a36Sopenharmony_ci if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { 352862306a36Sopenharmony_ci mptcp_data_lock(sk); 352962306a36Sopenharmony_ci if (!sock_owned_by_user(sk)) 353062306a36Sopenharmony_ci __mptcp_sync_sndbuf(sk); 353162306a36Sopenharmony_ci else 353262306a36Sopenharmony_ci __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); 353362306a36Sopenharmony_ci mptcp_data_unlock(sk); 353462306a36Sopenharmony_ci } 353562306a36Sopenharmony_ci if (status & BIT(MPTCP_DELEGATE_ACK)) 353662306a36Sopenharmony_ci schedule_3rdack_retransmission(ssk); 353762306a36Sopenharmony_ci} 353862306a36Sopenharmony_ci 353962306a36Sopenharmony_cistatic int mptcp_hash(struct sock *sk) 354062306a36Sopenharmony_ci{ 354162306a36Sopenharmony_ci /* should never be called, 354262306a36Sopenharmony_ci * we hash the TCP subflows not the master socket 354362306a36Sopenharmony_ci */ 354462306a36Sopenharmony_ci WARN_ON_ONCE(1); 354562306a36Sopenharmony_ci return 0; 354662306a36Sopenharmony_ci} 354762306a36Sopenharmony_ci 354862306a36Sopenharmony_cistatic void mptcp_unhash(struct 
sock *sk) 354962306a36Sopenharmony_ci{ 355062306a36Sopenharmony_ci /* called from sk_common_release(), but nothing to do here */ 355162306a36Sopenharmony_ci} 355262306a36Sopenharmony_ci 355362306a36Sopenharmony_cistatic int mptcp_get_port(struct sock *sk, unsigned short snum) 355462306a36Sopenharmony_ci{ 355562306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 355662306a36Sopenharmony_ci 355762306a36Sopenharmony_ci pr_debug("msk=%p, ssk=%p", msk, msk->first); 355862306a36Sopenharmony_ci if (WARN_ON_ONCE(!msk->first)) 355962306a36Sopenharmony_ci return -EINVAL; 356062306a36Sopenharmony_ci 356162306a36Sopenharmony_ci return inet_csk_get_port(msk->first, snum); 356262306a36Sopenharmony_ci} 356362306a36Sopenharmony_ci 356462306a36Sopenharmony_civoid mptcp_finish_connect(struct sock *ssk) 356562306a36Sopenharmony_ci{ 356662306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 356762306a36Sopenharmony_ci struct mptcp_sock *msk; 356862306a36Sopenharmony_ci struct sock *sk; 356962306a36Sopenharmony_ci 357062306a36Sopenharmony_ci subflow = mptcp_subflow_ctx(ssk); 357162306a36Sopenharmony_ci sk = subflow->conn; 357262306a36Sopenharmony_ci msk = mptcp_sk(sk); 357362306a36Sopenharmony_ci 357462306a36Sopenharmony_ci pr_debug("msk=%p, token=%u", sk, subflow->token); 357562306a36Sopenharmony_ci 357662306a36Sopenharmony_ci subflow->map_seq = subflow->iasn; 357762306a36Sopenharmony_ci subflow->map_subflow_seq = 1; 357862306a36Sopenharmony_ci 357962306a36Sopenharmony_ci /* the socket is not connected yet, no msk/subflow ops can access/race 358062306a36Sopenharmony_ci * accessing the field below 358162306a36Sopenharmony_ci */ 358262306a36Sopenharmony_ci WRITE_ONCE(msk->local_key, subflow->local_key); 358362306a36Sopenharmony_ci 358462306a36Sopenharmony_ci mptcp_pm_new_connection(msk, ssk, 0); 358562306a36Sopenharmony_ci} 358662306a36Sopenharmony_ci 358762306a36Sopenharmony_civoid mptcp_sock_graft(struct sock *sk, struct socket *parent) 358862306a36Sopenharmony_ci{ 
358962306a36Sopenharmony_ci write_lock_bh(&sk->sk_callback_lock); 359062306a36Sopenharmony_ci rcu_assign_pointer(sk->sk_wq, &parent->wq); 359162306a36Sopenharmony_ci sk_set_socket(sk, parent); 359262306a36Sopenharmony_ci sk->sk_uid = SOCK_INODE(parent)->i_uid; 359362306a36Sopenharmony_ci write_unlock_bh(&sk->sk_callback_lock); 359462306a36Sopenharmony_ci} 359562306a36Sopenharmony_ci 359662306a36Sopenharmony_cibool mptcp_finish_join(struct sock *ssk) 359762306a36Sopenharmony_ci{ 359862306a36Sopenharmony_ci struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 359962306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(subflow->conn); 360062306a36Sopenharmony_ci struct sock *parent = (void *)msk; 360162306a36Sopenharmony_ci bool ret = true; 360262306a36Sopenharmony_ci 360362306a36Sopenharmony_ci pr_debug("msk=%p, subflow=%p", msk, subflow); 360462306a36Sopenharmony_ci 360562306a36Sopenharmony_ci /* mptcp socket already closing? */ 360662306a36Sopenharmony_ci if (!mptcp_is_fully_established(parent)) { 360762306a36Sopenharmony_ci subflow->reset_reason = MPTCP_RST_EMPTCP; 360862306a36Sopenharmony_ci return false; 360962306a36Sopenharmony_ci } 361062306a36Sopenharmony_ci 361162306a36Sopenharmony_ci /* active subflow, already present inside the conn_list */ 361262306a36Sopenharmony_ci if (!list_empty(&subflow->node)) { 361362306a36Sopenharmony_ci mptcp_subflow_joined(msk, ssk); 361462306a36Sopenharmony_ci mptcp_propagate_sndbuf(parent, ssk); 361562306a36Sopenharmony_ci return true; 361662306a36Sopenharmony_ci } 361762306a36Sopenharmony_ci 361862306a36Sopenharmony_ci if (!mptcp_pm_allow_new_subflow(msk)) 361962306a36Sopenharmony_ci goto err_prohibited; 362062306a36Sopenharmony_ci 362162306a36Sopenharmony_ci /* If we can't acquire msk socket lock here, let the release callback 362262306a36Sopenharmony_ci * handle it 362362306a36Sopenharmony_ci */ 362462306a36Sopenharmony_ci mptcp_data_lock(parent); 362562306a36Sopenharmony_ci if (!sock_owned_by_user(parent)) { 
362662306a36Sopenharmony_ci ret = __mptcp_finish_join(msk, ssk); 362762306a36Sopenharmony_ci if (ret) { 362862306a36Sopenharmony_ci sock_hold(ssk); 362962306a36Sopenharmony_ci list_add_tail(&subflow->node, &msk->conn_list); 363062306a36Sopenharmony_ci } 363162306a36Sopenharmony_ci } else { 363262306a36Sopenharmony_ci sock_hold(ssk); 363362306a36Sopenharmony_ci list_add_tail(&subflow->node, &msk->join_list); 363462306a36Sopenharmony_ci __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); 363562306a36Sopenharmony_ci } 363662306a36Sopenharmony_ci mptcp_data_unlock(parent); 363762306a36Sopenharmony_ci 363862306a36Sopenharmony_ci if (!ret) { 363962306a36Sopenharmony_cierr_prohibited: 364062306a36Sopenharmony_ci subflow->reset_reason = MPTCP_RST_EPROHIBIT; 364162306a36Sopenharmony_ci return false; 364262306a36Sopenharmony_ci } 364362306a36Sopenharmony_ci 364462306a36Sopenharmony_ci return true; 364562306a36Sopenharmony_ci} 364662306a36Sopenharmony_ci 364762306a36Sopenharmony_cistatic void mptcp_shutdown(struct sock *sk, int how) 364862306a36Sopenharmony_ci{ 364962306a36Sopenharmony_ci pr_debug("sk=%p, how=%d", sk, how); 365062306a36Sopenharmony_ci 365162306a36Sopenharmony_ci if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) 365262306a36Sopenharmony_ci __mptcp_wr_shutdown(sk); 365362306a36Sopenharmony_ci} 365462306a36Sopenharmony_ci 365562306a36Sopenharmony_cistatic int mptcp_forward_alloc_get(const struct sock *sk) 365662306a36Sopenharmony_ci{ 365762306a36Sopenharmony_ci return READ_ONCE(sk->sk_forward_alloc) + 365862306a36Sopenharmony_ci READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); 365962306a36Sopenharmony_ci} 366062306a36Sopenharmony_ci 366162306a36Sopenharmony_cistatic int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) 366262306a36Sopenharmony_ci{ 366362306a36Sopenharmony_ci const struct sock *sk = (void *)msk; 366462306a36Sopenharmony_ci u64 delta; 366562306a36Sopenharmony_ci 366662306a36Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 366762306a36Sopenharmony_ci 
return -EINVAL; 366862306a36Sopenharmony_ci 366962306a36Sopenharmony_ci if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 367062306a36Sopenharmony_ci return 0; 367162306a36Sopenharmony_ci 367262306a36Sopenharmony_ci delta = msk->write_seq - v; 367362306a36Sopenharmony_ci if (__mptcp_check_fallback(msk) && msk->first) { 367462306a36Sopenharmony_ci struct tcp_sock *tp = tcp_sk(msk->first); 367562306a36Sopenharmony_ci 367662306a36Sopenharmony_ci /* the first subflow is disconnected after close - see 367762306a36Sopenharmony_ci * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq 367862306a36Sopenharmony_ci * so ignore that status, too. 367962306a36Sopenharmony_ci */ 368062306a36Sopenharmony_ci if (!((1 << msk->first->sk_state) & 368162306a36Sopenharmony_ci (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) 368262306a36Sopenharmony_ci delta += READ_ONCE(tp->write_seq) - tp->snd_una; 368362306a36Sopenharmony_ci } 368462306a36Sopenharmony_ci if (delta > INT_MAX) 368562306a36Sopenharmony_ci delta = INT_MAX; 368662306a36Sopenharmony_ci 368762306a36Sopenharmony_ci return (int)delta; 368862306a36Sopenharmony_ci} 368962306a36Sopenharmony_ci 369062306a36Sopenharmony_cistatic int mptcp_ioctl(struct sock *sk, int cmd, int *karg) 369162306a36Sopenharmony_ci{ 369262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 369362306a36Sopenharmony_ci bool slow; 369462306a36Sopenharmony_ci 369562306a36Sopenharmony_ci switch (cmd) { 369662306a36Sopenharmony_ci case SIOCINQ: 369762306a36Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 369862306a36Sopenharmony_ci return -EINVAL; 369962306a36Sopenharmony_ci 370062306a36Sopenharmony_ci lock_sock(sk); 370162306a36Sopenharmony_ci __mptcp_move_skbs(msk); 370262306a36Sopenharmony_ci *karg = mptcp_inq_hint(sk); 370362306a36Sopenharmony_ci release_sock(sk); 370462306a36Sopenharmony_ci break; 370562306a36Sopenharmony_ci case SIOCOUTQ: 370662306a36Sopenharmony_ci slow = lock_sock_fast(sk); 370762306a36Sopenharmony_ci *karg = 
mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); 370862306a36Sopenharmony_ci unlock_sock_fast(sk, slow); 370962306a36Sopenharmony_ci break; 371062306a36Sopenharmony_ci case SIOCOUTQNSD: 371162306a36Sopenharmony_ci slow = lock_sock_fast(sk); 371262306a36Sopenharmony_ci *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); 371362306a36Sopenharmony_ci unlock_sock_fast(sk, slow); 371462306a36Sopenharmony_ci break; 371562306a36Sopenharmony_ci default: 371662306a36Sopenharmony_ci return -ENOIOCTLCMD; 371762306a36Sopenharmony_ci } 371862306a36Sopenharmony_ci 371962306a36Sopenharmony_ci return 0; 372062306a36Sopenharmony_ci} 372162306a36Sopenharmony_ci 372262306a36Sopenharmony_cistatic void mptcp_subflow_early_fallback(struct mptcp_sock *msk, 372362306a36Sopenharmony_ci struct mptcp_subflow_context *subflow) 372462306a36Sopenharmony_ci{ 372562306a36Sopenharmony_ci subflow->request_mptcp = 0; 372662306a36Sopenharmony_ci __mptcp_do_fallback(msk); 372762306a36Sopenharmony_ci} 372862306a36Sopenharmony_ci 372962306a36Sopenharmony_cistatic int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 373062306a36Sopenharmony_ci{ 373162306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 373262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sk); 373362306a36Sopenharmony_ci int err = -EINVAL; 373462306a36Sopenharmony_ci struct sock *ssk; 373562306a36Sopenharmony_ci 373662306a36Sopenharmony_ci ssk = __mptcp_nmpc_sk(msk); 373762306a36Sopenharmony_ci if (IS_ERR(ssk)) 373862306a36Sopenharmony_ci return PTR_ERR(ssk); 373962306a36Sopenharmony_ci 374062306a36Sopenharmony_ci mptcp_set_state(sk, TCP_SYN_SENT); 374162306a36Sopenharmony_ci subflow = mptcp_subflow_ctx(ssk); 374262306a36Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 374362306a36Sopenharmony_ci /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 374462306a36Sopenharmony_ci * TCP option space. 
374562306a36Sopenharmony_ci */ 374662306a36Sopenharmony_ci if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) 374762306a36Sopenharmony_ci mptcp_subflow_early_fallback(msk, subflow); 374862306a36Sopenharmony_ci#endif 374962306a36Sopenharmony_ci if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) { 375062306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); 375162306a36Sopenharmony_ci mptcp_subflow_early_fallback(msk, subflow); 375262306a36Sopenharmony_ci } 375362306a36Sopenharmony_ci if (likely(!__mptcp_check_fallback(msk))) 375462306a36Sopenharmony_ci MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); 375562306a36Sopenharmony_ci 375662306a36Sopenharmony_ci /* if reaching here via the fastopen/sendmsg path, the caller already 375762306a36Sopenharmony_ci * acquired the subflow socket lock, too. 375862306a36Sopenharmony_ci */ 375962306a36Sopenharmony_ci if (!msk->fastopening) 376062306a36Sopenharmony_ci lock_sock(ssk); 376162306a36Sopenharmony_ci 376262306a36Sopenharmony_ci /* the following mirrors closely a very small chunk of code from 376362306a36Sopenharmony_ci * __inet_stream_connect() 376462306a36Sopenharmony_ci */ 376562306a36Sopenharmony_ci if (ssk->sk_state != TCP_CLOSE) 376662306a36Sopenharmony_ci goto out; 376762306a36Sopenharmony_ci 376862306a36Sopenharmony_ci if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { 376962306a36Sopenharmony_ci err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); 377062306a36Sopenharmony_ci if (err) 377162306a36Sopenharmony_ci goto out; 377262306a36Sopenharmony_ci } 377362306a36Sopenharmony_ci 377462306a36Sopenharmony_ci err = ssk->sk_prot->connect(ssk, uaddr, addr_len); 377562306a36Sopenharmony_ci if (err < 0) 377662306a36Sopenharmony_ci goto out; 377762306a36Sopenharmony_ci 377862306a36Sopenharmony_ci inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); 377962306a36Sopenharmony_ci 378062306a36Sopenharmony_ciout: 378162306a36Sopenharmony_ci if (!msk->fastopening) 
378262306a36Sopenharmony_ci release_sock(ssk); 378362306a36Sopenharmony_ci 378462306a36Sopenharmony_ci /* on successful connect, the msk state will be moved to established by 378562306a36Sopenharmony_ci * subflow_finish_connect() 378662306a36Sopenharmony_ci */ 378762306a36Sopenharmony_ci if (unlikely(err)) { 378862306a36Sopenharmony_ci /* avoid leaving a dangling token in an unconnected socket */ 378962306a36Sopenharmony_ci mptcp_token_destroy(msk); 379062306a36Sopenharmony_ci mptcp_set_state(sk, TCP_CLOSE); 379162306a36Sopenharmony_ci return err; 379262306a36Sopenharmony_ci } 379362306a36Sopenharmony_ci 379462306a36Sopenharmony_ci mptcp_copy_inaddrs(sk, ssk); 379562306a36Sopenharmony_ci return 0; 379662306a36Sopenharmony_ci} 379762306a36Sopenharmony_ci /* mptcp_prot: protocol descriptor for MPTCP sockets. It wires the mptcp_* handlers into struct proto. Only sockets_allocated points at an MPTCP-specific counter; memory_allocated, per_cpu_fw_alloc, memory_pressure, sysctl_mem and the wmem/rmem sysctl offsets all reference the TCP ones. obj_size is sizeof(struct mptcp_sock). */ 379862306a36Sopenharmony_cistatic struct proto mptcp_prot = { 379962306a36Sopenharmony_ci .name = "MPTCP", 380062306a36Sopenharmony_ci .owner = THIS_MODULE, 380162306a36Sopenharmony_ci .init = mptcp_init_sock, 380262306a36Sopenharmony_ci .connect = mptcp_connect, 380362306a36Sopenharmony_ci .disconnect = mptcp_disconnect, 380462306a36Sopenharmony_ci .close = mptcp_close, 380562306a36Sopenharmony_ci .accept = mptcp_accept, 380662306a36Sopenharmony_ci .setsockopt = mptcp_setsockopt, 380762306a36Sopenharmony_ci .getsockopt = mptcp_getsockopt, 380862306a36Sopenharmony_ci .shutdown = mptcp_shutdown, 380962306a36Sopenharmony_ci .destroy = mptcp_destroy, 381062306a36Sopenharmony_ci .sendmsg = mptcp_sendmsg, 381162306a36Sopenharmony_ci .ioctl = mptcp_ioctl, 381262306a36Sopenharmony_ci .recvmsg = mptcp_recvmsg, 381362306a36Sopenharmony_ci .release_cb = mptcp_release_cb, 381462306a36Sopenharmony_ci .hash = mptcp_hash, 381562306a36Sopenharmony_ci .unhash = mptcp_unhash, 381662306a36Sopenharmony_ci .get_port = mptcp_get_port, 381762306a36Sopenharmony_ci .forward_alloc_get = mptcp_forward_alloc_get, 381862306a36Sopenharmony_ci .sockets_allocated = &mptcp_sockets_allocated, 381962306a36Sopenharmony_ci 382062306a36Sopenharmony_ci 
.memory_allocated = &tcp_memory_allocated, 382162306a36Sopenharmony_ci .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 382262306a36Sopenharmony_ci 382362306a36Sopenharmony_ci .memory_pressure = &tcp_memory_pressure, 382462306a36Sopenharmony_ci .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 382562306a36Sopenharmony_ci .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 382662306a36Sopenharmony_ci .sysctl_mem = sysctl_tcp_mem, 382762306a36Sopenharmony_ci .obj_size = sizeof(struct mptcp_sock), 382862306a36Sopenharmony_ci .slab_flags = SLAB_TYPESAFE_BY_RCU, 382962306a36Sopenharmony_ci .no_autobind = true, 383062306a36Sopenharmony_ci}; 383162306a36Sopenharmony_ci /* mptcp_bind: bind an MPTCP socket. Under the msk socket lock it obtains the first subflow via __mptcp_nmpc_sk() (returning PTR_ERR() if that fails), forwards the request to inet_bind_sk() for AF_INET or, when CONFIG_MPTCP_IPV6 is enabled, inet6_bind_sk() for AF_INET6, and on success copies the addresses back to the msk with mptcp_copy_inaddrs(). Returns 0 or a negative errno; err keeps its initial -EINVAL when sk_family matches neither branch. */ 383262306a36Sopenharmony_cistatic int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 383362306a36Sopenharmony_ci{ 383462306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sock->sk); 383562306a36Sopenharmony_ci struct sock *ssk, *sk = sock->sk; 383662306a36Sopenharmony_ci int err = -EINVAL; 383762306a36Sopenharmony_ci 383862306a36Sopenharmony_ci lock_sock(sk); 383962306a36Sopenharmony_ci ssk = __mptcp_nmpc_sk(msk); 384062306a36Sopenharmony_ci if (IS_ERR(ssk)) { 384162306a36Sopenharmony_ci err = PTR_ERR(ssk); 384262306a36Sopenharmony_ci goto unlock; 384362306a36Sopenharmony_ci } 384462306a36Sopenharmony_ci 384562306a36Sopenharmony_ci if (sk->sk_family == AF_INET) 384662306a36Sopenharmony_ci err = inet_bind_sk(ssk, uaddr, addr_len); 384762306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 384862306a36Sopenharmony_ci else if (sk->sk_family == AF_INET6) 384962306a36Sopenharmony_ci err = inet6_bind_sk(ssk, uaddr, addr_len); 385062306a36Sopenharmony_ci#endif 385162306a36Sopenharmony_ci if (!err) 385262306a36Sopenharmony_ci mptcp_copy_inaddrs(sk, ssk); 385362306a36Sopenharmony_ci 385462306a36Sopenharmony_ciunlock: 385562306a36Sopenharmony_ci release_sock(sk); 385662306a36Sopenharmony_ci return err; 385762306a36Sopenharmony_ci} 
385862306a36Sopenharmony_ci 385962306a36Sopenharmony_cistatic int mptcp_listen(struct socket *sock, int backlog) 386062306a36Sopenharmony_ci{ 386162306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sock->sk); 386262306a36Sopenharmony_ci struct sock *sk = sock->sk; 386362306a36Sopenharmony_ci struct sock *ssk; 386462306a36Sopenharmony_ci int err; 386562306a36Sopenharmony_ci 386662306a36Sopenharmony_ci pr_debug("msk=%p", msk); 386762306a36Sopenharmony_ci 386862306a36Sopenharmony_ci lock_sock(sk); 386962306a36Sopenharmony_ci 387062306a36Sopenharmony_ci err = -EINVAL; 387162306a36Sopenharmony_ci if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) 387262306a36Sopenharmony_ci goto unlock; 387362306a36Sopenharmony_ci 387462306a36Sopenharmony_ci ssk = __mptcp_nmpc_sk(msk); 387562306a36Sopenharmony_ci if (IS_ERR(ssk)) { 387662306a36Sopenharmony_ci err = PTR_ERR(ssk); 387762306a36Sopenharmony_ci goto unlock; 387862306a36Sopenharmony_ci } 387962306a36Sopenharmony_ci 388062306a36Sopenharmony_ci mptcp_set_state(sk, TCP_LISTEN); 388162306a36Sopenharmony_ci sock_set_flag(sk, SOCK_RCU_FREE); 388262306a36Sopenharmony_ci 388362306a36Sopenharmony_ci lock_sock(ssk); 388462306a36Sopenharmony_ci err = __inet_listen_sk(ssk, backlog); 388562306a36Sopenharmony_ci release_sock(ssk); 388662306a36Sopenharmony_ci mptcp_set_state(sk, inet_sk_state_load(ssk)); 388762306a36Sopenharmony_ci 388862306a36Sopenharmony_ci if (!err) { 388962306a36Sopenharmony_ci sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 389062306a36Sopenharmony_ci mptcp_copy_inaddrs(sk, ssk); 389162306a36Sopenharmony_ci mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); 389262306a36Sopenharmony_ci } 389362306a36Sopenharmony_ci 389462306a36Sopenharmony_ciunlock: 389562306a36Sopenharmony_ci release_sock(sk); 389662306a36Sopenharmony_ci return err; 389762306a36Sopenharmony_ci} 389862306a36Sopenharmony_ci 389962306a36Sopenharmony_cistatic int mptcp_stream_accept(struct socket *sock, struct socket 
*newsock, 390062306a36Sopenharmony_ci int flags, bool kern) 390162306a36Sopenharmony_ci{ 390262306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(sock->sk); 390362306a36Sopenharmony_ci struct sock *ssk, *newsk; 390462306a36Sopenharmony_ci int err; 390562306a36Sopenharmony_ci 390662306a36Sopenharmony_ci pr_debug("msk=%p", msk); 390762306a36Sopenharmony_ci 390862306a36Sopenharmony_ci /* Buggy applications can call accept on socket states other then LISTEN 390962306a36Sopenharmony_ci * but no need to allocate the first subflow just to error out. 391062306a36Sopenharmony_ci */ 391162306a36Sopenharmony_ci ssk = READ_ONCE(msk->first); 391262306a36Sopenharmony_ci if (!ssk) 391362306a36Sopenharmony_ci return -EINVAL; 391462306a36Sopenharmony_ci 391562306a36Sopenharmony_ci newsk = mptcp_accept(ssk, flags, &err, kern); 391662306a36Sopenharmony_ci if (!newsk) 391762306a36Sopenharmony_ci return err; 391862306a36Sopenharmony_ci 391962306a36Sopenharmony_ci lock_sock(newsk); 392062306a36Sopenharmony_ci 392162306a36Sopenharmony_ci __inet_accept(sock, newsock, newsk); 392262306a36Sopenharmony_ci if (!mptcp_is_tcpsk(newsock->sk)) { 392362306a36Sopenharmony_ci struct mptcp_sock *msk = mptcp_sk(newsk); 392462306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 392562306a36Sopenharmony_ci 392662306a36Sopenharmony_ci set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); 392762306a36Sopenharmony_ci msk->in_accept_queue = 0; 392862306a36Sopenharmony_ci 392962306a36Sopenharmony_ci /* set ssk->sk_socket of accept()ed flows to mptcp socket. 393062306a36Sopenharmony_ci * This is needed so NOSPACE flag can be set from tcp stack. 
393162306a36Sopenharmony_ci */ 393262306a36Sopenharmony_ci mptcp_for_each_subflow(msk, subflow) { 393362306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 393462306a36Sopenharmony_ci 393562306a36Sopenharmony_ci if (!ssk->sk_socket) 393662306a36Sopenharmony_ci mptcp_sock_graft(ssk, newsock); 393762306a36Sopenharmony_ci } 393862306a36Sopenharmony_ci 393962306a36Sopenharmony_ci /* Do late cleanup for the first subflow as necessary. Also 394062306a36Sopenharmony_ci * deal with bad peers not doing a complete shutdown. 394162306a36Sopenharmony_ci */ 394262306a36Sopenharmony_ci if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { 394362306a36Sopenharmony_ci __mptcp_close_ssk(newsk, msk->first, 394462306a36Sopenharmony_ci mptcp_subflow_ctx(msk->first), 0); 394562306a36Sopenharmony_ci if (unlikely(list_is_singular(&msk->conn_list))) 394662306a36Sopenharmony_ci mptcp_set_state(newsk, TCP_CLOSE); 394762306a36Sopenharmony_ci } 394862306a36Sopenharmony_ci } 394962306a36Sopenharmony_ci release_sock(newsk); 395062306a36Sopenharmony_ci 395162306a36Sopenharmony_ci return 0; 395262306a36Sopenharmony_ci} 395362306a36Sopenharmony_ci 395462306a36Sopenharmony_cistatic __poll_t mptcp_check_writeable(struct mptcp_sock *msk) 395562306a36Sopenharmony_ci{ 395662306a36Sopenharmony_ci struct sock *sk = (struct sock *)msk; 395762306a36Sopenharmony_ci 395862306a36Sopenharmony_ci if (sk_stream_is_writeable(sk)) 395962306a36Sopenharmony_ci return EPOLLOUT | EPOLLWRNORM; 396062306a36Sopenharmony_ci 396162306a36Sopenharmony_ci mptcp_set_nospace(sk); 396262306a36Sopenharmony_ci smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ 396362306a36Sopenharmony_ci if (sk_stream_is_writeable(sk)) 396462306a36Sopenharmony_ci return EPOLLOUT | EPOLLWRNORM; 396562306a36Sopenharmony_ci 396662306a36Sopenharmony_ci return 0; 396762306a36Sopenharmony_ci} 396862306a36Sopenharmony_ci 396962306a36Sopenharmony_cistatic __poll_t mptcp_poll(struct file *file, struct socket 
*sock, 397062306a36Sopenharmony_ci struct poll_table_struct *wait) 397162306a36Sopenharmony_ci{ 397262306a36Sopenharmony_ci struct sock *sk = sock->sk; 397362306a36Sopenharmony_ci struct mptcp_sock *msk; 397462306a36Sopenharmony_ci __poll_t mask = 0; 397562306a36Sopenharmony_ci u8 shutdown; 397662306a36Sopenharmony_ci int state; 397762306a36Sopenharmony_ci 397862306a36Sopenharmony_ci msk = mptcp_sk(sk); 397962306a36Sopenharmony_ci sock_poll_wait(file, sock, wait); 398062306a36Sopenharmony_ci 398162306a36Sopenharmony_ci state = inet_sk_state_load(sk); 398262306a36Sopenharmony_ci pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); 398362306a36Sopenharmony_ci if (state == TCP_LISTEN) { 398462306a36Sopenharmony_ci struct sock *ssk = READ_ONCE(msk->first); 398562306a36Sopenharmony_ci 398662306a36Sopenharmony_ci if (WARN_ON_ONCE(!ssk)) 398762306a36Sopenharmony_ci return 0; 398862306a36Sopenharmony_ci 398962306a36Sopenharmony_ci return inet_csk_listen_poll(ssk); 399062306a36Sopenharmony_ci } 399162306a36Sopenharmony_ci 399262306a36Sopenharmony_ci shutdown = READ_ONCE(sk->sk_shutdown); 399362306a36Sopenharmony_ci if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) 399462306a36Sopenharmony_ci mask |= EPOLLHUP; 399562306a36Sopenharmony_ci if (shutdown & RCV_SHUTDOWN) 399662306a36Sopenharmony_ci mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 399762306a36Sopenharmony_ci 399862306a36Sopenharmony_ci if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { 399962306a36Sopenharmony_ci mask |= mptcp_check_readable(msk); 400062306a36Sopenharmony_ci if (shutdown & SEND_SHUTDOWN) 400162306a36Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 400262306a36Sopenharmony_ci else 400362306a36Sopenharmony_ci mask |= mptcp_check_writeable(msk); 400462306a36Sopenharmony_ci } else if (state == TCP_SYN_SENT && 400562306a36Sopenharmony_ci inet_test_bit(DEFER_CONNECT, sk)) { 400662306a36Sopenharmony_ci /* cf tcp_poll() note about TFO */ 400762306a36Sopenharmony_ci mask |= EPOLLOUT | 
EPOLLWRNORM; 400862306a36Sopenharmony_ci } 400962306a36Sopenharmony_ci 401062306a36Sopenharmony_ci /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ 401162306a36Sopenharmony_ci smp_rmb(); 401262306a36Sopenharmony_ci if (READ_ONCE(sk->sk_err)) 401362306a36Sopenharmony_ci mask |= EPOLLERR; 401462306a36Sopenharmony_ci 401562306a36Sopenharmony_ci return mask; 401662306a36Sopenharmony_ci} 401762306a36Sopenharmony_ci 401862306a36Sopenharmony_cistatic const struct proto_ops mptcp_stream_ops = { 401962306a36Sopenharmony_ci .family = PF_INET, 402062306a36Sopenharmony_ci .owner = THIS_MODULE, 402162306a36Sopenharmony_ci .release = inet_release, 402262306a36Sopenharmony_ci .bind = mptcp_bind, 402362306a36Sopenharmony_ci .connect = inet_stream_connect, 402462306a36Sopenharmony_ci .socketpair = sock_no_socketpair, 402562306a36Sopenharmony_ci .accept = mptcp_stream_accept, 402662306a36Sopenharmony_ci .getname = inet_getname, 402762306a36Sopenharmony_ci .poll = mptcp_poll, 402862306a36Sopenharmony_ci .ioctl = inet_ioctl, 402962306a36Sopenharmony_ci .gettstamp = sock_gettstamp, 403062306a36Sopenharmony_ci .listen = mptcp_listen, 403162306a36Sopenharmony_ci .shutdown = inet_shutdown, 403262306a36Sopenharmony_ci .setsockopt = sock_common_setsockopt, 403362306a36Sopenharmony_ci .getsockopt = sock_common_getsockopt, 403462306a36Sopenharmony_ci .sendmsg = inet_sendmsg, 403562306a36Sopenharmony_ci .recvmsg = inet_recvmsg, 403662306a36Sopenharmony_ci .mmap = sock_no_mmap, 403762306a36Sopenharmony_ci}; 403862306a36Sopenharmony_ci 403962306a36Sopenharmony_cistatic struct inet_protosw mptcp_protosw = { 404062306a36Sopenharmony_ci .type = SOCK_STREAM, 404162306a36Sopenharmony_ci .protocol = IPPROTO_MPTCP, 404262306a36Sopenharmony_ci .prot = &mptcp_prot, 404362306a36Sopenharmony_ci .ops = &mptcp_stream_ops, 404462306a36Sopenharmony_ci .flags = INET_PROTOSW_ICSK, 404562306a36Sopenharmony_ci}; 404662306a36Sopenharmony_ci 404762306a36Sopenharmony_cistatic int 
mptcp_napi_poll(struct napi_struct *napi, int budget) 404862306a36Sopenharmony_ci{ 404962306a36Sopenharmony_ci struct mptcp_delegated_action *delegated; 405062306a36Sopenharmony_ci struct mptcp_subflow_context *subflow; 405162306a36Sopenharmony_ci int work_done = 0; 405262306a36Sopenharmony_ci 405362306a36Sopenharmony_ci delegated = container_of(napi, struct mptcp_delegated_action, napi); 405462306a36Sopenharmony_ci while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { 405562306a36Sopenharmony_ci struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 405662306a36Sopenharmony_ci 405762306a36Sopenharmony_ci bh_lock_sock_nested(ssk); 405862306a36Sopenharmony_ci if (!sock_owned_by_user(ssk)) { 405962306a36Sopenharmony_ci mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); 406062306a36Sopenharmony_ci } else { 406162306a36Sopenharmony_ci /* tcp_release_cb_override already processed 406262306a36Sopenharmony_ci * the action or will do at next release_sock(). 406362306a36Sopenharmony_ci * In both case must dequeue the subflow here - on the same 406462306a36Sopenharmony_ci * CPU that scheduled it. 
406562306a36Sopenharmony_ci */ 406662306a36Sopenharmony_ci smp_wmb(); 406762306a36Sopenharmony_ci clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); 406862306a36Sopenharmony_ci } 406962306a36Sopenharmony_ci bh_unlock_sock(ssk); 407062306a36Sopenharmony_ci sock_put(ssk); 407162306a36Sopenharmony_ci 407262306a36Sopenharmony_ci if (++work_done == budget) 407362306a36Sopenharmony_ci return budget; 407462306a36Sopenharmony_ci } 407562306a36Sopenharmony_ci 407662306a36Sopenharmony_ci /* always provide a 0 'work_done' argument, so that napi_complete_done 407762306a36Sopenharmony_ci * will not try accessing the NULL napi->dev ptr 407862306a36Sopenharmony_ci */ 407962306a36Sopenharmony_ci napi_complete_done(napi, 0); 408062306a36Sopenharmony_ci return work_done; 408162306a36Sopenharmony_ci} 408262306a36Sopenharmony_ci 408362306a36Sopenharmony_civoid __init mptcp_proto_init(void) 408462306a36Sopenharmony_ci{ 408562306a36Sopenharmony_ci struct mptcp_delegated_action *delegated; 408662306a36Sopenharmony_ci int cpu; 408762306a36Sopenharmony_ci 408862306a36Sopenharmony_ci mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; 408962306a36Sopenharmony_ci 409062306a36Sopenharmony_ci if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) 409162306a36Sopenharmony_ci panic("Failed to allocate MPTCP pcpu counter\n"); 409262306a36Sopenharmony_ci 409362306a36Sopenharmony_ci init_dummy_netdev(&mptcp_napi_dev); 409462306a36Sopenharmony_ci for_each_possible_cpu(cpu) { 409562306a36Sopenharmony_ci delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); 409662306a36Sopenharmony_ci INIT_LIST_HEAD(&delegated->head); 409762306a36Sopenharmony_ci netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, 409862306a36Sopenharmony_ci mptcp_napi_poll); 409962306a36Sopenharmony_ci napi_enable(&delegated->napi); 410062306a36Sopenharmony_ci } 410162306a36Sopenharmony_ci 410262306a36Sopenharmony_ci mptcp_subflow_init(); 410362306a36Sopenharmony_ci mptcp_pm_init(); 410462306a36Sopenharmony_ci 
mptcp_sched_init(); 410562306a36Sopenharmony_ci mptcp_token_init(); 410662306a36Sopenharmony_ci 410762306a36Sopenharmony_ci if (proto_register(&mptcp_prot, 1) != 0) 410862306a36Sopenharmony_ci panic("Failed to register MPTCP proto.\n"); 410962306a36Sopenharmony_ci 411062306a36Sopenharmony_ci inet_register_protosw(&mptcp_protosw); 411162306a36Sopenharmony_ci 411262306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); 411362306a36Sopenharmony_ci} 411462306a36Sopenharmony_ci 411562306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_MPTCP_IPV6) 411662306a36Sopenharmony_cistatic const struct proto_ops mptcp_v6_stream_ops = { 411762306a36Sopenharmony_ci .family = PF_INET6, 411862306a36Sopenharmony_ci .owner = THIS_MODULE, 411962306a36Sopenharmony_ci .release = inet6_release, 412062306a36Sopenharmony_ci .bind = mptcp_bind, 412162306a36Sopenharmony_ci .connect = inet_stream_connect, 412262306a36Sopenharmony_ci .socketpair = sock_no_socketpair, 412362306a36Sopenharmony_ci .accept = mptcp_stream_accept, 412462306a36Sopenharmony_ci .getname = inet6_getname, 412562306a36Sopenharmony_ci .poll = mptcp_poll, 412662306a36Sopenharmony_ci .ioctl = inet6_ioctl, 412762306a36Sopenharmony_ci .gettstamp = sock_gettstamp, 412862306a36Sopenharmony_ci .listen = mptcp_listen, 412962306a36Sopenharmony_ci .shutdown = inet_shutdown, 413062306a36Sopenharmony_ci .setsockopt = sock_common_setsockopt, 413162306a36Sopenharmony_ci .getsockopt = sock_common_getsockopt, 413262306a36Sopenharmony_ci .sendmsg = inet6_sendmsg, 413362306a36Sopenharmony_ci .recvmsg = inet6_recvmsg, 413462306a36Sopenharmony_ci .mmap = sock_no_mmap, 413562306a36Sopenharmony_ci#ifdef CONFIG_COMPAT 413662306a36Sopenharmony_ci .compat_ioctl = inet6_compat_ioctl, 413762306a36Sopenharmony_ci#endif 413862306a36Sopenharmony_ci}; 413962306a36Sopenharmony_ci 414062306a36Sopenharmony_cistatic struct proto mptcp_v6_prot; 414162306a36Sopenharmony_ci 414262306a36Sopenharmony_cistatic struct inet_protosw 
mptcp_v6_protosw = { 414362306a36Sopenharmony_ci .type = SOCK_STREAM, 414462306a36Sopenharmony_ci .protocol = IPPROTO_MPTCP, 414562306a36Sopenharmony_ci .prot = &mptcp_v6_prot, 414662306a36Sopenharmony_ci .ops = &mptcp_v6_stream_ops, 414762306a36Sopenharmony_ci .flags = INET_PROTOSW_ICSK, 414862306a36Sopenharmony_ci}; 414962306a36Sopenharmony_ci 415062306a36Sopenharmony_ciint __init mptcp_proto_v6_init(void) 415162306a36Sopenharmony_ci{ 415262306a36Sopenharmony_ci int err; 415362306a36Sopenharmony_ci 415462306a36Sopenharmony_ci mptcp_v6_prot = mptcp_prot; 415562306a36Sopenharmony_ci strcpy(mptcp_v6_prot.name, "MPTCPv6"); 415662306a36Sopenharmony_ci mptcp_v6_prot.slab = NULL; 415762306a36Sopenharmony_ci mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); 415862306a36Sopenharmony_ci mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); 415962306a36Sopenharmony_ci 416062306a36Sopenharmony_ci err = proto_register(&mptcp_v6_prot, 1); 416162306a36Sopenharmony_ci if (err) 416262306a36Sopenharmony_ci return err; 416362306a36Sopenharmony_ci 416462306a36Sopenharmony_ci err = inet6_register_protosw(&mptcp_v6_protosw); 416562306a36Sopenharmony_ci if (err) 416662306a36Sopenharmony_ci proto_unregister(&mptcp_v6_prot); 416762306a36Sopenharmony_ci 416862306a36Sopenharmony_ci return err; 416962306a36Sopenharmony_ci} 417062306a36Sopenharmony_ci#endif 4171