18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * INET An implementation of the TCP/IP protocol suite for the LINUX 48c2ecf20Sopenharmony_ci * operating system. INET is implemented using the BSD Socket 58c2ecf20Sopenharmony_ci * interface as the means of communication with the user level. 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Implementation of the Transmission Control Protocol(TCP). 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * Authors: Ross Biro 108c2ecf20Sopenharmony_ci * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 118c2ecf20Sopenharmony_ci * Mark Evans, <evansmp@uhura.aston.ac.uk> 128c2ecf20Sopenharmony_ci * Corey Minyard <wf-rch!minyard@relay.EU.net> 138c2ecf20Sopenharmony_ci * Florian La Roche, <flla@stud.uni-sb.de> 148c2ecf20Sopenharmony_ci * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 158c2ecf20Sopenharmony_ci * Linus Torvalds, <torvalds@cs.helsinki.fi> 168c2ecf20Sopenharmony_ci * Alan Cox, <gw4pts@gw4pts.ampr.org> 178c2ecf20Sopenharmony_ci * Matthew Dillon, <dillon@apollo.west.oic.com> 188c2ecf20Sopenharmony_ci * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 198c2ecf20Sopenharmony_ci * Jorge Cwik, <jorge@laser.satlink.net> 208c2ecf20Sopenharmony_ci * 218c2ecf20Sopenharmony_ci * Fixes: 228c2ecf20Sopenharmony_ci * Alan Cox : Numerous verify_area() calls 238c2ecf20Sopenharmony_ci * Alan Cox : Set the ACK bit on a reset 248c2ecf20Sopenharmony_ci * Alan Cox : Stopped it crashing if it closed while 258c2ecf20Sopenharmony_ci * sk->inuse=1 and was trying to connect 268c2ecf20Sopenharmony_ci * (tcp_err()). 278c2ecf20Sopenharmony_ci * Alan Cox : All icmp error handling was broken 288c2ecf20Sopenharmony_ci * pointers passed where wrong and the 298c2ecf20Sopenharmony_ci * socket was looked up backwards. Nobody 308c2ecf20Sopenharmony_ci * tested any icmp error code obviously. 318c2ecf20Sopenharmony_ci * Alan Cox : tcp_err() now handled properly. It 328c2ecf20Sopenharmony_ci * wakes people on errors. poll 338c2ecf20Sopenharmony_ci * behaves and the icmp error race 348c2ecf20Sopenharmony_ci * has gone by moving it into sock.c 358c2ecf20Sopenharmony_ci * Alan Cox : tcp_send_reset() fixed to work for 368c2ecf20Sopenharmony_ci * everything not just packets for 378c2ecf20Sopenharmony_ci * unknown sockets. 388c2ecf20Sopenharmony_ci * Alan Cox : tcp option processing. 398c2ecf20Sopenharmony_ci * Alan Cox : Reset tweaked (still not 100%) [Had 408c2ecf20Sopenharmony_ci * syn rule wrong] 418c2ecf20Sopenharmony_ci * Herp Rosmanith : More reset fixes 428c2ecf20Sopenharmony_ci * Alan Cox : No longer acks invalid rst frames. 438c2ecf20Sopenharmony_ci * Acking any kind of RST is right out. 448c2ecf20Sopenharmony_ci * Alan Cox : Sets an ignore me flag on an rst 458c2ecf20Sopenharmony_ci * receive otherwise odd bits of prattle 468c2ecf20Sopenharmony_ci * escape still 478c2ecf20Sopenharmony_ci * Alan Cox : Fixed another acking RST frame bug. 488c2ecf20Sopenharmony_ci * Should stop LAN workplace lockups. 498c2ecf20Sopenharmony_ci * Alan Cox : Some tidyups using the new skb list 508c2ecf20Sopenharmony_ci * facilities 518c2ecf20Sopenharmony_ci * Alan Cox : sk->keepopen now seems to work 528c2ecf20Sopenharmony_ci * Alan Cox : Pulls options out correctly on accepts 538c2ecf20Sopenharmony_ci * Alan Cox : Fixed assorted sk->rqueue->next errors 548c2ecf20Sopenharmony_ci * Alan Cox : PSH doesn't end a TCP read. Switched a 558c2ecf20Sopenharmony_ci * bit to skb ops. 568c2ecf20Sopenharmony_ci * Alan Cox : Tidied tcp_data to avoid a potential 578c2ecf20Sopenharmony_ci * nasty. 588c2ecf20Sopenharmony_ci * Alan Cox : Added some better commenting, as the 598c2ecf20Sopenharmony_ci * tcp is hard to follow 608c2ecf20Sopenharmony_ci * Alan Cox : Removed incorrect check for 20 * psh 618c2ecf20Sopenharmony_ci * Michael O'Reilly : ack < copied bug fix. 628c2ecf20Sopenharmony_ci * Johannes Stille : Misc tcp fixes (not all in yet). 638c2ecf20Sopenharmony_ci * Alan Cox : FIN with no memory -> CRASH 648c2ecf20Sopenharmony_ci * Alan Cox : Added socket option proto entries. 658c2ecf20Sopenharmony_ci * Also added awareness of them to accept. 668c2ecf20Sopenharmony_ci * Alan Cox : Added TCP options (SOL_TCP) 678c2ecf20Sopenharmony_ci * Alan Cox : Switched wakeup calls to callbacks, 688c2ecf20Sopenharmony_ci * so the kernel can layer network 698c2ecf20Sopenharmony_ci * sockets. 708c2ecf20Sopenharmony_ci * Alan Cox : Use ip_tos/ip_ttl settings. 718c2ecf20Sopenharmony_ci * Alan Cox : Handle FIN (more) properly (we hope). 728c2ecf20Sopenharmony_ci * Alan Cox : RST frames sent on unsynchronised 738c2ecf20Sopenharmony_ci * state ack error. 748c2ecf20Sopenharmony_ci * Alan Cox : Put in missing check for SYN bit. 758c2ecf20Sopenharmony_ci * Alan Cox : Added tcp_select_window() aka NET2E 768c2ecf20Sopenharmony_ci * window non shrink trick. 778c2ecf20Sopenharmony_ci * Alan Cox : Added a couple of small NET2E timer 788c2ecf20Sopenharmony_ci * fixes 798c2ecf20Sopenharmony_ci * Charles Hedrick : TCP fixes 808c2ecf20Sopenharmony_ci * Toomas Tamm : TCP window fixes 818c2ecf20Sopenharmony_ci * Alan Cox : Small URG fix to rlogin ^C ack fight 828c2ecf20Sopenharmony_ci * Charles Hedrick : Rewrote most of it to actually work 838c2ecf20Sopenharmony_ci * Linus : Rewrote tcp_read() and URG handling 848c2ecf20Sopenharmony_ci * completely 858c2ecf20Sopenharmony_ci * Gerhard Koerting: Fixed some missing timer handling 868c2ecf20Sopenharmony_ci * Matthew Dillon : Reworked TCP machine states as per RFC 878c2ecf20Sopenharmony_ci * Gerhard Koerting: PC/TCP workarounds 888c2ecf20Sopenharmony_ci * Adam Caldwell : Assorted timer/timing errors 898c2ecf20Sopenharmony_ci * Matthew Dillon : Fixed another RST bug 908c2ecf20Sopenharmony_ci * Alan Cox : Move to kernel side addressing changes. 918c2ecf20Sopenharmony_ci * Alan Cox : Beginning work on TCP fastpathing 928c2ecf20Sopenharmony_ci * (not yet usable) 938c2ecf20Sopenharmony_ci * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 948c2ecf20Sopenharmony_ci * Alan Cox : TCP fast path debugging 958c2ecf20Sopenharmony_ci * Alan Cox : Window clamping 968c2ecf20Sopenharmony_ci * Michael Riepe : Bug in tcp_check() 978c2ecf20Sopenharmony_ci * Matt Dillon : More TCP improvements and RST bug fixes 988c2ecf20Sopenharmony_ci * Matt Dillon : Yet more small nasties remove from the 998c2ecf20Sopenharmony_ci * TCP code (Be very nice to this man if 1008c2ecf20Sopenharmony_ci * tcp finally works 100%) 8) 1018c2ecf20Sopenharmony_ci * Alan Cox : BSD accept semantics. 1028c2ecf20Sopenharmony_ci * Alan Cox : Reset on closedown bug. 1038c2ecf20Sopenharmony_ci * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 1048c2ecf20Sopenharmony_ci * Michael Pall : Handle poll() after URG properly in 1058c2ecf20Sopenharmony_ci * all cases. 1068c2ecf20Sopenharmony_ci * Michael Pall : Undo the last fix in tcp_read_urg() 1078c2ecf20Sopenharmony_ci * (multi URG PUSH broke rlogin). 1088c2ecf20Sopenharmony_ci * Michael Pall : Fix the multi URG PUSH problem in 1098c2ecf20Sopenharmony_ci * tcp_readable(), poll() after URG 1108c2ecf20Sopenharmony_ci * works now. 1118c2ecf20Sopenharmony_ci * Michael Pall : recv(...,MSG_OOB) never blocks in the 1128c2ecf20Sopenharmony_ci * BSD api. 1138c2ecf20Sopenharmony_ci * Alan Cox : Changed the semantics of sk->socket to 1148c2ecf20Sopenharmony_ci * fix a race and a signal problem with 1158c2ecf20Sopenharmony_ci * accept() and async I/O. 1168c2ecf20Sopenharmony_ci * Alan Cox : Relaxed the rules on tcp_sendto(). 1178c2ecf20Sopenharmony_ci * Yury Shevchuk : Really fixed accept() blocking problem. 1188c2ecf20Sopenharmony_ci * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 1198c2ecf20Sopenharmony_ci * clients/servers which listen in on 1208c2ecf20Sopenharmony_ci * fixed ports. 1218c2ecf20Sopenharmony_ci * Alan Cox : Cleaned the above up and shrank it to 1228c2ecf20Sopenharmony_ci * a sensible code size. 1238c2ecf20Sopenharmony_ci * Alan Cox : Self connect lockup fix. 1248c2ecf20Sopenharmony_ci * Alan Cox : No connect to multicast. 1258c2ecf20Sopenharmony_ci * Ross Biro : Close unaccepted children on master 1268c2ecf20Sopenharmony_ci * socket close. 1278c2ecf20Sopenharmony_ci * Alan Cox : Reset tracing code. 1288c2ecf20Sopenharmony_ci * Alan Cox : Spurious resets on shutdown. 1298c2ecf20Sopenharmony_ci * Alan Cox : Giant 15 minute/60 second timer error 1308c2ecf20Sopenharmony_ci * Alan Cox : Small whoops in polling before an 1318c2ecf20Sopenharmony_ci * accept. 1328c2ecf20Sopenharmony_ci * Alan Cox : Kept the state trace facility since 1338c2ecf20Sopenharmony_ci * it's handy for debugging. 1348c2ecf20Sopenharmony_ci * Alan Cox : More reset handler fixes. 1358c2ecf20Sopenharmony_ci * Alan Cox : Started rewriting the code based on 1368c2ecf20Sopenharmony_ci * the RFC's for other useful protocol 1378c2ecf20Sopenharmony_ci * references see: Comer, KA9Q NOS, and 1388c2ecf20Sopenharmony_ci * for a reference on the difference 1398c2ecf20Sopenharmony_ci * between specifications and how BSD 1408c2ecf20Sopenharmony_ci * works see the 4.4lite source. 1418c2ecf20Sopenharmony_ci * A.N.Kuznetsov : Don't time wait on completion of tidy 1428c2ecf20Sopenharmony_ci * close. 1438c2ecf20Sopenharmony_ci * Linus Torvalds : Fin/Shutdown & copied_seq changes. 1448c2ecf20Sopenharmony_ci * Linus Torvalds : Fixed BSD port reuse to work first syn 1458c2ecf20Sopenharmony_ci * Alan Cox : Reimplemented timers as per the RFC 1468c2ecf20Sopenharmony_ci * and using multiple timers for sanity. 1478c2ecf20Sopenharmony_ci * Alan Cox : Small bug fixes, and a lot of new 1488c2ecf20Sopenharmony_ci * comments. 1498c2ecf20Sopenharmony_ci * Alan Cox : Fixed dual reader crash by locking 1508c2ecf20Sopenharmony_ci * the buffers (much like datagram.c) 1518c2ecf20Sopenharmony_ci * Alan Cox : Fixed stuck sockets in probe. A probe 1528c2ecf20Sopenharmony_ci * now gets fed up of retrying without 1538c2ecf20Sopenharmony_ci * (even a no space) answer. 1548c2ecf20Sopenharmony_ci * Alan Cox : Extracted closing code better 1558c2ecf20Sopenharmony_ci * Alan Cox : Fixed the closing state machine to 1568c2ecf20Sopenharmony_ci * resemble the RFC. 1578c2ecf20Sopenharmony_ci * Alan Cox : More 'per spec' fixes. 1588c2ecf20Sopenharmony_ci * Jorge Cwik : Even faster checksumming. 1598c2ecf20Sopenharmony_ci * Alan Cox : tcp_data() doesn't ack illegal PSH 1608c2ecf20Sopenharmony_ci * only frames. At least one pc tcp stack 1618c2ecf20Sopenharmony_ci * generates them. 1628c2ecf20Sopenharmony_ci * Alan Cox : Cache last socket. 1638c2ecf20Sopenharmony_ci * Alan Cox : Per route irtt. 1648c2ecf20Sopenharmony_ci * Matt Day : poll()->select() match BSD precisely on error 1658c2ecf20Sopenharmony_ci * Alan Cox : New buffers 1668c2ecf20Sopenharmony_ci * Marc Tamsky : Various sk->prot->retransmits and 1678c2ecf20Sopenharmony_ci * sk->retransmits misupdating fixed. 1688c2ecf20Sopenharmony_ci * Fixed tcp_write_timeout: stuck close, 1698c2ecf20Sopenharmony_ci * and TCP syn retries gets used now. 1708c2ecf20Sopenharmony_ci * Mark Yarvis : In tcp_read_wakeup(), don't send an 1718c2ecf20Sopenharmony_ci * ack if state is TCP_CLOSED. 1728c2ecf20Sopenharmony_ci * Alan Cox : Look up device on a retransmit - routes may 1738c2ecf20Sopenharmony_ci * change. Doesn't yet cope with MSS shrink right 1748c2ecf20Sopenharmony_ci * but it's a start! 1758c2ecf20Sopenharmony_ci * Marc Tamsky : Closing in closing fixes. 1768c2ecf20Sopenharmony_ci * Mike Shaver : RFC1122 verifications. 1778c2ecf20Sopenharmony_ci * Alan Cox : rcv_saddr errors. 1788c2ecf20Sopenharmony_ci * Alan Cox : Block double connect(). 1798c2ecf20Sopenharmony_ci * Alan Cox : Small hooks for enSKIP. 1808c2ecf20Sopenharmony_ci * Alexey Kuznetsov: Path MTU discovery. 1818c2ecf20Sopenharmony_ci * Alan Cox : Support soft errors. 1828c2ecf20Sopenharmony_ci * Alan Cox : Fix MTU discovery pathological case 1838c2ecf20Sopenharmony_ci * when the remote claims no mtu! 1848c2ecf20Sopenharmony_ci * Marc Tamsky : TCP_CLOSE fix. 1858c2ecf20Sopenharmony_ci * Colin (G3TNE) : Send a reset on syn ack replies in 1868c2ecf20Sopenharmony_ci * window but wrong (fixes NT lpd problems) 1878c2ecf20Sopenharmony_ci * Pedro Roque : Better TCP window handling, delayed ack. 1888c2ecf20Sopenharmony_ci * Joerg Reuter : No modification of locked buffers in 1898c2ecf20Sopenharmony_ci * tcp_do_retransmit() 1908c2ecf20Sopenharmony_ci * Eric Schenk : Changed receiver side silly window 1918c2ecf20Sopenharmony_ci * avoidance algorithm to BSD style 1928c2ecf20Sopenharmony_ci * algorithm. This doubles throughput 1938c2ecf20Sopenharmony_ci * against machines running Solaris, 1948c2ecf20Sopenharmony_ci * and seems to result in general 1958c2ecf20Sopenharmony_ci * improvement. 1968c2ecf20Sopenharmony_ci * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD 1978c2ecf20Sopenharmony_ci * Willy Konynenberg : Transparent proxying support. 1988c2ecf20Sopenharmony_ci * Mike McLagan : Routing by source 1998c2ecf20Sopenharmony_ci * Keith Owens : Do proper merging with partial SKB's in 2008c2ecf20Sopenharmony_ci * tcp_do_sendmsg to avoid burstiness. 2018c2ecf20Sopenharmony_ci * Eric Schenk : Fix fast close down bug with 2028c2ecf20Sopenharmony_ci * shutdown() followed by close(). 2038c2ecf20Sopenharmony_ci * Andi Kleen : Make poll agree with SIGIO 2048c2ecf20Sopenharmony_ci * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and 2058c2ecf20Sopenharmony_ci * lingertime == 0 (RFC 793 ABORT Call) 2068c2ecf20Sopenharmony_ci * Hirokazu Takahashi : Use copy_from_user() instead of 2078c2ecf20Sopenharmony_ci * csum_and_copy_from_user() if possible. 2088c2ecf20Sopenharmony_ci * 2098c2ecf20Sopenharmony_ci * Description of States: 2108c2ecf20Sopenharmony_ci * 2118c2ecf20Sopenharmony_ci * TCP_SYN_SENT sent a connection request, waiting for ack 2128c2ecf20Sopenharmony_ci * 2138c2ecf20Sopenharmony_ci * TCP_SYN_RECV received a connection request, sent ack, 2148c2ecf20Sopenharmony_ci * waiting for final ack in three-way handshake. 2158c2ecf20Sopenharmony_ci * 2168c2ecf20Sopenharmony_ci * TCP_ESTABLISHED connection established 2178c2ecf20Sopenharmony_ci * 2188c2ecf20Sopenharmony_ci * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 2198c2ecf20Sopenharmony_ci * transmission of remaining buffered data 2208c2ecf20Sopenharmony_ci * 2218c2ecf20Sopenharmony_ci * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 2228c2ecf20Sopenharmony_ci * to shutdown 2238c2ecf20Sopenharmony_ci * 2248c2ecf20Sopenharmony_ci * TCP_CLOSING both sides have shutdown but we still have 2258c2ecf20Sopenharmony_ci * data we have to finish sending 2268c2ecf20Sopenharmony_ci * 2278c2ecf20Sopenharmony_ci * TCP_TIME_WAIT timeout to catch resent junk before entering 2288c2ecf20Sopenharmony_ci * closed, can only be entered from FIN_WAIT2 2298c2ecf20Sopenharmony_ci * or CLOSING. Required because the other end 2308c2ecf20Sopenharmony_ci * may not have gotten our last ACK causing it 2318c2ecf20Sopenharmony_ci * to retransmit the data packet (which we ignore) 2328c2ecf20Sopenharmony_ci * 2338c2ecf20Sopenharmony_ci * TCP_CLOSE_WAIT remote side has shutdown and is waiting for 2348c2ecf20Sopenharmony_ci * us to finish writing our data and to shutdown 2358c2ecf20Sopenharmony_ci * (we have to close() to move on to LAST_ACK) 2368c2ecf20Sopenharmony_ci * 2378c2ecf20Sopenharmony_ci * TCP_LAST_ACK out side has shutdown after remote has 2388c2ecf20Sopenharmony_ci * shutdown. There may still be data in our 2398c2ecf20Sopenharmony_ci * buffer that we have to finish sending 2408c2ecf20Sopenharmony_ci * 2418c2ecf20Sopenharmony_ci * TCP_CLOSE socket is finished 2428c2ecf20Sopenharmony_ci */ 2438c2ecf20Sopenharmony_ci 2448c2ecf20Sopenharmony_ci#define pr_fmt(fmt) "TCP: " fmt 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci#include <crypto/hash.h> 2478c2ecf20Sopenharmony_ci#include <linux/kernel.h> 2488c2ecf20Sopenharmony_ci#include <linux/module.h> 2498c2ecf20Sopenharmony_ci#include <linux/types.h> 2508c2ecf20Sopenharmony_ci#include <linux/fcntl.h> 2518c2ecf20Sopenharmony_ci#include <linux/poll.h> 2528c2ecf20Sopenharmony_ci#include <linux/inet_diag.h> 2538c2ecf20Sopenharmony_ci#include <linux/init.h> 2548c2ecf20Sopenharmony_ci#include <linux/fs.h> 2558c2ecf20Sopenharmony_ci#include <linux/skbuff.h> 2568c2ecf20Sopenharmony_ci#include <linux/scatterlist.h> 2578c2ecf20Sopenharmony_ci#include <linux/splice.h> 2588c2ecf20Sopenharmony_ci#include <linux/net.h> 2598c2ecf20Sopenharmony_ci#include <linux/socket.h> 2608c2ecf20Sopenharmony_ci#include <linux/random.h> 2618c2ecf20Sopenharmony_ci#include <linux/memblock.h> 2628c2ecf20Sopenharmony_ci#include <linux/highmem.h> 2638c2ecf20Sopenharmony_ci#include <linux/swap.h> 2648c2ecf20Sopenharmony_ci#include <linux/cache.h> 2658c2ecf20Sopenharmony_ci#include <linux/err.h> 2668c2ecf20Sopenharmony_ci#include <linux/time.h> 2678c2ecf20Sopenharmony_ci#include <linux/slab.h> 2688c2ecf20Sopenharmony_ci#include <linux/errqueue.h> 2698c2ecf20Sopenharmony_ci#include <linux/static_key.h> 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci#include <net/icmp.h> 2728c2ecf20Sopenharmony_ci#include <net/inet_common.h> 2738c2ecf20Sopenharmony_ci#include <net/tcp.h> 2748c2ecf20Sopenharmony_ci#include <net/mptcp.h> 2758c2ecf20Sopenharmony_ci#include <net/xfrm.h> 2768c2ecf20Sopenharmony_ci#include <net/ip.h> 2778c2ecf20Sopenharmony_ci#include <net/sock.h> 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci#include <linux/uaccess.h> 2808c2ecf20Sopenharmony_ci#include <asm/ioctls.h> 2818c2ecf20Sopenharmony_ci#include <net/busy_poll.h> 2828c2ecf20Sopenharmony_ci#ifdef CONFIG_LOWPOWER_PROTOCOL 2838c2ecf20Sopenharmony_ci#include <net/lowpower_protocol.h> 2848c2ecf20Sopenharmony_ci#endif /* CONFIG_LOWPOWER_PROTOCOL */ 2858c2ecf20Sopenharmony_ci#if defined(CONFIG_TCP_NATA_URC) || defined(CONFIG_TCP_NATA_STL) 2868c2ecf20Sopenharmony_ci#include <net/nata.h> 2878c2ecf20Sopenharmony_ci#endif 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_ciDEFINE_PER_CPU(unsigned int, tcp_orphan_count); 2908c2ecf20Sopenharmony_ciEXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_cilong sysctl_tcp_mem[3] __read_mostly; 2938c2ecf20Sopenharmony_ciEXPORT_SYMBOL(sysctl_tcp_mem); 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ciatomic_long_t tcp_memory_allocated; /* Current allocated memory. */ 2968c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_memory_allocated); 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_SMC) 2998c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(tcp_have_smc); 3008c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_have_smc); 3018c2ecf20Sopenharmony_ci#endif 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci/* 3048c2ecf20Sopenharmony_ci * Current number of TCP sockets. 3058c2ecf20Sopenharmony_ci */ 3068c2ecf20Sopenharmony_cistruct percpu_counter tcp_sockets_allocated; 3078c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sockets_allocated); 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci/* 3108c2ecf20Sopenharmony_ci * TCP splice context 3118c2ecf20Sopenharmony_ci */ 3128c2ecf20Sopenharmony_cistruct tcp_splice_state { 3138c2ecf20Sopenharmony_ci struct pipe_inode_info *pipe; 3148c2ecf20Sopenharmony_ci size_t len; 3158c2ecf20Sopenharmony_ci unsigned int flags; 3168c2ecf20Sopenharmony_ci}; 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci/* 3198c2ecf20Sopenharmony_ci * Pressure flag: try to collapse. 3208c2ecf20Sopenharmony_ci * Technical note: it is used by multiple contexts non atomically. 3218c2ecf20Sopenharmony_ci * All the __sk_mem_schedule() is of this nature: accounting 3228c2ecf20Sopenharmony_ci * is strict, actions are advisory and have some latency. 3238c2ecf20Sopenharmony_ci */ 3248c2ecf20Sopenharmony_ciunsigned long tcp_memory_pressure __read_mostly; 3258c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_memory_pressure); 3268c2ecf20Sopenharmony_ci 3278c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); 3288c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_rx_skb_cache_key); 3298c2ecf20Sopenharmony_ci 3308c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_civoid tcp_enter_memory_pressure(struct sock *sk) 3338c2ecf20Sopenharmony_ci{ 3348c2ecf20Sopenharmony_ci unsigned long val; 3358c2ecf20Sopenharmony_ci 3368c2ecf20Sopenharmony_ci if (READ_ONCE(tcp_memory_pressure)) 3378c2ecf20Sopenharmony_ci return; 3388c2ecf20Sopenharmony_ci val = jiffies; 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ci if (!val) 3418c2ecf20Sopenharmony_ci val--; 3428c2ecf20Sopenharmony_ci if (!cmpxchg(&tcp_memory_pressure, 0, val)) 3438c2ecf20Sopenharmony_ci NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); 3448c2ecf20Sopenharmony_ci} 3458c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_civoid tcp_leave_memory_pressure(struct sock *sk) 3488c2ecf20Sopenharmony_ci{ 3498c2ecf20Sopenharmony_ci unsigned long val; 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci if (!READ_ONCE(tcp_memory_pressure)) 3528c2ecf20Sopenharmony_ci return; 3538c2ecf20Sopenharmony_ci val = xchg(&tcp_memory_pressure, 0); 3548c2ecf20Sopenharmony_ci if (val) 3558c2ecf20Sopenharmony_ci NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, 3568c2ecf20Sopenharmony_ci jiffies_to_msecs(jiffies - val)); 3578c2ecf20Sopenharmony_ci} 3588c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci/* Convert seconds to retransmits based on initial and max timeout */ 3618c2ecf20Sopenharmony_cistatic u8 secs_to_retrans(int seconds, int timeout, int rto_max) 3628c2ecf20Sopenharmony_ci{ 3638c2ecf20Sopenharmony_ci u8 res = 0; 3648c2ecf20Sopenharmony_ci 3658c2ecf20Sopenharmony_ci if (seconds > 0) { 3668c2ecf20Sopenharmony_ci int period = timeout; 3678c2ecf20Sopenharmony_ci 3688c2ecf20Sopenharmony_ci res = 1; 3698c2ecf20Sopenharmony_ci while (seconds > period && res < 255) { 3708c2ecf20Sopenharmony_ci res++; 3718c2ecf20Sopenharmony_ci timeout <<= 1; 3728c2ecf20Sopenharmony_ci if (timeout > rto_max) 3738c2ecf20Sopenharmony_ci timeout = rto_max; 3748c2ecf20Sopenharmony_ci period += timeout; 3758c2ecf20Sopenharmony_ci } 3768c2ecf20Sopenharmony_ci } 3778c2ecf20Sopenharmony_ci return res; 3788c2ecf20Sopenharmony_ci} 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_ci/* Convert retransmits to seconds based on initial and max timeout */ 3818c2ecf20Sopenharmony_cistatic int retrans_to_secs(u8 retrans, int timeout, int rto_max) 3828c2ecf20Sopenharmony_ci{ 3838c2ecf20Sopenharmony_ci int period = 0; 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci if (retrans > 0) { 3868c2ecf20Sopenharmony_ci period = timeout; 3878c2ecf20Sopenharmony_ci while (--retrans) { 3888c2ecf20Sopenharmony_ci timeout <<= 1; 3898c2ecf20Sopenharmony_ci if (timeout > rto_max) 3908c2ecf20Sopenharmony_ci timeout = rto_max; 3918c2ecf20Sopenharmony_ci period += timeout; 3928c2ecf20Sopenharmony_ci } 3938c2ecf20Sopenharmony_ci } 3948c2ecf20Sopenharmony_ci return period; 3958c2ecf20Sopenharmony_ci} 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_cistatic u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) 3988c2ecf20Sopenharmony_ci{ 3998c2ecf20Sopenharmony_ci u32 rate = READ_ONCE(tp->rate_delivered); 4008c2ecf20Sopenharmony_ci u32 intv = READ_ONCE(tp->rate_interval_us); 4018c2ecf20Sopenharmony_ci u64 rate64 = 0; 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci if (rate && intv) { 4048c2ecf20Sopenharmony_ci rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; 4058c2ecf20Sopenharmony_ci do_div(rate64, intv); 4068c2ecf20Sopenharmony_ci } 4078c2ecf20Sopenharmony_ci return rate64; 4088c2ecf20Sopenharmony_ci} 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci/* Address-family independent initialization for a tcp_sock. 4118c2ecf20Sopenharmony_ci * 4128c2ecf20Sopenharmony_ci * NOTE: A lot of things set to zero explicitly by call to 4138c2ecf20Sopenharmony_ci * sk_alloc() so need not be done here. 4148c2ecf20Sopenharmony_ci */ 4158c2ecf20Sopenharmony_civoid tcp_init_sock(struct sock *sk) 4168c2ecf20Sopenharmony_ci{ 4178c2ecf20Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 4188c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci tp->out_of_order_queue = RB_ROOT; 4218c2ecf20Sopenharmony_ci sk->tcp_rtx_queue = RB_ROOT; 4228c2ecf20Sopenharmony_ci tcp_init_xmit_timers(sk); 4238c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&tp->tsq_node); 4248c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&tp->tsorted_sent_queue); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci icsk->icsk_rto = TCP_TIMEOUT_INIT; 4278c2ecf20Sopenharmony_ci icsk->icsk_rto_min = TCP_RTO_MIN; 4288c2ecf20Sopenharmony_ci icsk->icsk_delack_max = TCP_DELACK_MAX; 4298c2ecf20Sopenharmony_ci tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 4308c2ecf20Sopenharmony_ci minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci /* So many TCP implementations out there (incorrectly) count the 4338c2ecf20Sopenharmony_ci * initial SYN frame in their delayed-ACK and congestion control 4348c2ecf20Sopenharmony_ci * algorithms that we must have the following bandaid to talk 4358c2ecf20Sopenharmony_ci * efficiently to them. -DaveM 4368c2ecf20Sopenharmony_ci */ 4378c2ecf20Sopenharmony_ci tp->snd_cwnd = TCP_INIT_CWND; 4388c2ecf20Sopenharmony_ci 4398c2ecf20Sopenharmony_ci /* There's a bubble in the pipe until at least the first ACK. */ 4408c2ecf20Sopenharmony_ci tp->app_limited = ~0U; 4418c2ecf20Sopenharmony_ci tp->rate_app_limited = 1; 4428c2ecf20Sopenharmony_ci 4438c2ecf20Sopenharmony_ci /* See draft-stevens-tcpca-spec-01 for discussion of the 4448c2ecf20Sopenharmony_ci * initialization of these values. 4458c2ecf20Sopenharmony_ci */ 4468c2ecf20Sopenharmony_ci tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 4478c2ecf20Sopenharmony_ci tp->snd_cwnd_clamp = ~0; 4488c2ecf20Sopenharmony_ci tp->mss_cache = TCP_MSS_DEFAULT; 4498c2ecf20Sopenharmony_ci 4508c2ecf20Sopenharmony_ci tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); 4518c2ecf20Sopenharmony_ci tcp_assign_congestion_control(sk); 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_ci tp->tsoffset = 0; 4548c2ecf20Sopenharmony_ci tp->rack.reo_wnd_steps = 1; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci sk->sk_write_space = sk_stream_write_space; 4578c2ecf20Sopenharmony_ci sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci icsk->icsk_sync_mss = tcp_sync_mss; 4608c2ecf20Sopenharmony_ci 4618c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); 4628c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); 4638c2ecf20Sopenharmony_ci 4648c2ecf20Sopenharmony_ci sk_sockets_allocated_inc(sk); 4658c2ecf20Sopenharmony_ci sk->sk_route_forced_caps = NETIF_F_GSO; 4668c2ecf20Sopenharmony_ci#if defined(CONFIG_TCP_NATA_URC) || defined(CONFIG_TCP_NATA_STL) 4678c2ecf20Sopenharmony_ci icsk->nata_retries_enabled = 0; 4688c2ecf20Sopenharmony_ci icsk->nata_retries_type = NATA_NA; 4698c2ecf20Sopenharmony_ci icsk->nata_syn_rto = TCP_TIMEOUT_INIT; 4708c2ecf20Sopenharmony_ci icsk->nata_data_rto = TCP_TIMEOUT_INIT; 4718c2ecf20Sopenharmony_ci icsk->nata_data_retries = 0; 4728c2ecf20Sopenharmony_ci#endif 4738c2ecf20Sopenharmony_ci} 4748c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_init_sock); 4758c2ecf20Sopenharmony_ci 4768c2ecf20Sopenharmony_cistatic void tcp_tx_timestamp(struct sock *sk, u16 tsflags) 4778c2ecf20Sopenharmony_ci{ 4788c2ecf20Sopenharmony_ci struct sk_buff *skb = tcp_write_queue_tail(sk); 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci if (tsflags && skb) { 4818c2ecf20Sopenharmony_ci struct skb_shared_info *shinfo = skb_shinfo(skb); 4828c2ecf20Sopenharmony_ci struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 4838c2ecf20Sopenharmony_ci 4848c2ecf20Sopenharmony_ci sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); 4858c2ecf20Sopenharmony_ci if (tsflags & SOF_TIMESTAMPING_TX_ACK) 4868c2ecf20Sopenharmony_ci tcb->txstamp_ack = 1; 4878c2ecf20Sopenharmony_ci if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) 4888c2ecf20Sopenharmony_ci shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; 4898c2ecf20Sopenharmony_ci } 4908c2ecf20Sopenharmony_ci} 4918c2ecf20Sopenharmony_ci 4928c2ecf20Sopenharmony_cistatic inline bool tcp_stream_is_readable(const struct tcp_sock *tp, 4938c2ecf20Sopenharmony_ci int target, struct sock *sk) 4948c2ecf20Sopenharmony_ci{ 4958c2ecf20Sopenharmony_ci int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci if (avail > 0) { 4988c2ecf20Sopenharmony_ci if (avail >= target) 4998c2ecf20Sopenharmony_ci return true; 5008c2ecf20Sopenharmony_ci if (tcp_rmem_pressure(sk)) 5018c2ecf20Sopenharmony_ci return true; 5028c2ecf20Sopenharmony_ci if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) 5038c2ecf20Sopenharmony_ci return true; 5048c2ecf20Sopenharmony_ci } 5058c2ecf20Sopenharmony_ci if (sk->sk_prot->stream_memory_read) 5068c2ecf20Sopenharmony_ci return sk->sk_prot->stream_memory_read(sk); 5078c2ecf20Sopenharmony_ci return false; 5088c2ecf20Sopenharmony_ci} 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci/* 5118c2ecf20Sopenharmony_ci * Wait for a TCP event. 5128c2ecf20Sopenharmony_ci * 5138c2ecf20Sopenharmony_ci * Note that we don't need to lock the socket, as the upper poll layers 5148c2ecf20Sopenharmony_ci * take care of normal races (between the test and the event) and we don't 5158c2ecf20Sopenharmony_ci * go look at any of the socket buffers directly. 5168c2ecf20Sopenharmony_ci */ 5178c2ecf20Sopenharmony_ci__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) 5188c2ecf20Sopenharmony_ci{ 5198c2ecf20Sopenharmony_ci __poll_t mask; 5208c2ecf20Sopenharmony_ci struct sock *sk = sock->sk; 5218c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 5228c2ecf20Sopenharmony_ci u8 shutdown; 5238c2ecf20Sopenharmony_ci int state; 5248c2ecf20Sopenharmony_ci 5258c2ecf20Sopenharmony_ci sock_poll_wait(file, sock, wait); 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci state = inet_sk_state_load(sk); 5288c2ecf20Sopenharmony_ci if (state == TCP_LISTEN) 5298c2ecf20Sopenharmony_ci return inet_csk_listen_poll(sk); 5308c2ecf20Sopenharmony_ci 5318c2ecf20Sopenharmony_ci /* Socket is not locked. We are protected from async events 5328c2ecf20Sopenharmony_ci * by poll logic and correct handling of state changes 5338c2ecf20Sopenharmony_ci * made by other threads is impossible in any case. 5348c2ecf20Sopenharmony_ci */ 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci mask = 0; 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci /* 5398c2ecf20Sopenharmony_ci * EPOLLHUP is certainly not done right. But poll() doesn't 5408c2ecf20Sopenharmony_ci * have a notion of HUP in just one direction, and for a 5418c2ecf20Sopenharmony_ci * socket the read side is more interesting. 5428c2ecf20Sopenharmony_ci * 5438c2ecf20Sopenharmony_ci * Some poll() documentation says that EPOLLHUP is incompatible 5448c2ecf20Sopenharmony_ci * with the EPOLLOUT/POLLWR flags, so somebody should check this 5458c2ecf20Sopenharmony_ci * all. But careful, it tends to be safer to return too many 5468c2ecf20Sopenharmony_ci * bits than too few, and you can easily break real applications 5478c2ecf20Sopenharmony_ci * if you don't tell them that something has hung up! 5488c2ecf20Sopenharmony_ci * 5498c2ecf20Sopenharmony_ci * Check-me. 5508c2ecf20Sopenharmony_ci * 5518c2ecf20Sopenharmony_ci * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and 5528c2ecf20Sopenharmony_ci * our fs/select.c). It means that after we received EOF, 5538c2ecf20Sopenharmony_ci * poll always returns immediately, making impossible poll() on write() 5548c2ecf20Sopenharmony_ci * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP 5558c2ecf20Sopenharmony_ci * if and only if shutdown has been made in both directions. 5568c2ecf20Sopenharmony_ci * Actually, it is interesting to look how Solaris and DUX 5578c2ecf20Sopenharmony_ci * solve this dilemma. I would prefer, if EPOLLHUP were maskable, 5588c2ecf20Sopenharmony_ci * then we could set it on SND_SHUTDOWN. BTW examples given 5598c2ecf20Sopenharmony_ci * in Stevens' books assume exactly this behaviour, it explains 5608c2ecf20Sopenharmony_ci * why EPOLLHUP is incompatible with EPOLLOUT. --ANK 5618c2ecf20Sopenharmony_ci * 5628c2ecf20Sopenharmony_ci * NOTE. Check for TCP_CLOSE is added. The goal is to prevent 5638c2ecf20Sopenharmony_ci * blocking on fresh not-connected or disconnected socket. --ANK 5648c2ecf20Sopenharmony_ci */ 5658c2ecf20Sopenharmony_ci shutdown = READ_ONCE(sk->sk_shutdown); 5668c2ecf20Sopenharmony_ci if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) 5678c2ecf20Sopenharmony_ci mask |= EPOLLHUP; 5688c2ecf20Sopenharmony_ci if (shutdown & RCV_SHUTDOWN) 5698c2ecf20Sopenharmony_ci mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 5708c2ecf20Sopenharmony_ci 5718c2ecf20Sopenharmony_ci /* Connected or passive Fast Open socket? */ 5728c2ecf20Sopenharmony_ci if (state != TCP_SYN_SENT && 5738c2ecf20Sopenharmony_ci (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { 5748c2ecf20Sopenharmony_ci int target = sock_rcvlowat(sk, 0, INT_MAX); 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_ci if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && 5778c2ecf20Sopenharmony_ci !sock_flag(sk, SOCK_URGINLINE) && 5788c2ecf20Sopenharmony_ci tp->urg_data) 5798c2ecf20Sopenharmony_ci target++; 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_ci if (tcp_stream_is_readable(tp, target, sk)) 5828c2ecf20Sopenharmony_ci mask |= EPOLLIN | EPOLLRDNORM; 5838c2ecf20Sopenharmony_ci 5848c2ecf20Sopenharmony_ci if (!(shutdown & SEND_SHUTDOWN)) { 5858c2ecf20Sopenharmony_ci if (__sk_stream_is_writeable(sk, 1)) { 5868c2ecf20Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 5878c2ecf20Sopenharmony_ci } else { /* send SIGIO later */ 5888c2ecf20Sopenharmony_ci sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 5898c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 5908c2ecf20Sopenharmony_ci 5918c2ecf20Sopenharmony_ci /* Race breaker. If space is freed after 5928c2ecf20Sopenharmony_ci * wspace test but before the flags are set, 5938c2ecf20Sopenharmony_ci * IO signal will be lost. Memory barrier 5948c2ecf20Sopenharmony_ci * pairs with the input side. 5958c2ecf20Sopenharmony_ci */ 5968c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 5978c2ecf20Sopenharmony_ci if (__sk_stream_is_writeable(sk, 1)) 5988c2ecf20Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 5998c2ecf20Sopenharmony_ci } 6008c2ecf20Sopenharmony_ci } else 6018c2ecf20Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 6028c2ecf20Sopenharmony_ci 6038c2ecf20Sopenharmony_ci if (tp->urg_data & TCP_URG_VALID) 6048c2ecf20Sopenharmony_ci mask |= EPOLLPRI; 6058c2ecf20Sopenharmony_ci } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { 6068c2ecf20Sopenharmony_ci /* Active TCP fastopen socket with defer_connect 6078c2ecf20Sopenharmony_ci * Return EPOLLOUT so application can call write() 6088c2ecf20Sopenharmony_ci * in order for kernel to generate SYN+data 6098c2ecf20Sopenharmony_ci */ 6108c2ecf20Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 6118c2ecf20Sopenharmony_ci } 6128c2ecf20Sopenharmony_ci /* This barrier is coupled with smp_wmb() in tcp_reset() */ 6138c2ecf20Sopenharmony_ci smp_rmb(); 6148c2ecf20Sopenharmony_ci if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) 6158c2ecf20Sopenharmony_ci mask |= EPOLLERR; 6168c2ecf20Sopenharmony_ci 6178c2ecf20Sopenharmony_ci return mask; 6188c2ecf20Sopenharmony_ci} 6198c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_poll); 6208c2ecf20Sopenharmony_ci 6218c2ecf20Sopenharmony_ciint tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 6228c2ecf20Sopenharmony_ci{ 6238c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 6248c2ecf20Sopenharmony_ci int answ; 6258c2ecf20Sopenharmony_ci bool slow; 6268c2ecf20Sopenharmony_ci 6278c2ecf20Sopenharmony_ci switch (cmd) { 6288c2ecf20Sopenharmony_ci case SIOCINQ: 6298c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 6308c2ecf20Sopenharmony_ci return -EINVAL; 6318c2ecf20Sopenharmony_ci 6328c2ecf20Sopenharmony_ci slow = lock_sock_fast(sk); 6338c2ecf20Sopenharmony_ci answ = tcp_inq(sk); 6348c2ecf20Sopenharmony_ci unlock_sock_fast(sk, slow); 6358c2ecf20Sopenharmony_ci break; 6368c2ecf20Sopenharmony_ci case SIOCATMARK: 6378c2ecf20Sopenharmony_ci answ = tp->urg_data && 6388c2ecf20Sopenharmony_ci READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); 6398c2ecf20Sopenharmony_ci break; 6408c2ecf20Sopenharmony_ci case SIOCOUTQ: 6418c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 6428c2ecf20Sopenharmony_ci return -EINVAL; 6438c2ecf20Sopenharmony_ci 6448c2ecf20Sopenharmony_ci if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 6458c2ecf20Sopenharmony_ci answ = 0; 6468c2ecf20Sopenharmony_ci else 6478c2ecf20Sopenharmony_ci answ = READ_ONCE(tp->write_seq) - tp->snd_una; 6488c2ecf20Sopenharmony_ci break; 6498c2ecf20Sopenharmony_ci case SIOCOUTQNSD: 6508c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 6518c2ecf20Sopenharmony_ci return -EINVAL; 6528c2ecf20Sopenharmony_ci 6538c2ecf20Sopenharmony_ci if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 6548c2ecf20Sopenharmony_ci answ = 0; 6558c2ecf20Sopenharmony_ci else 6568c2ecf20Sopenharmony_ci answ = READ_ONCE(tp->write_seq) - 6578c2ecf20Sopenharmony_ci READ_ONCE(tp->snd_nxt); 6588c2ecf20Sopenharmony_ci break; 6598c2ecf20Sopenharmony_ci default: 6608c2ecf20Sopenharmony_ci return -ENOIOCTLCMD; 6618c2ecf20Sopenharmony_ci } 6628c2ecf20Sopenharmony_ci 6638c2ecf20Sopenharmony_ci return put_user(answ, (int __user *)arg); 6648c2ecf20Sopenharmony_ci} 6658c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_ioctl); 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_cistatic inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 6688c2ecf20Sopenharmony_ci{ 6698c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 6708c2ecf20Sopenharmony_ci tp->pushed_seq = tp->write_seq; 6718c2ecf20Sopenharmony_ci} 6728c2ecf20Sopenharmony_ci 6738c2ecf20Sopenharmony_cistatic inline bool forced_push(const struct tcp_sock *tp) 6748c2ecf20Sopenharmony_ci{ 6758c2ecf20Sopenharmony_ci return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 6768c2ecf20Sopenharmony_ci} 6778c2ecf20Sopenharmony_ci 6788c2ecf20Sopenharmony_cistatic void skb_entail(struct sock *sk, struct sk_buff *skb) 6798c2ecf20Sopenharmony_ci{ 6808c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 6818c2ecf20Sopenharmony_ci struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 6828c2ecf20Sopenharmony_ci 6838c2ecf20Sopenharmony_ci skb->csum = 0; 6848c2ecf20Sopenharmony_ci tcb->seq = tcb->end_seq = tp->write_seq; 6858c2ecf20Sopenharmony_ci tcb->tcp_flags = TCPHDR_ACK; 6868c2ecf20Sopenharmony_ci tcb->sacked = 0; 6878c2ecf20Sopenharmony_ci __skb_header_release(skb); 6888c2ecf20Sopenharmony_ci tcp_add_write_queue_tail(sk, skb); 6898c2ecf20Sopenharmony_ci sk_wmem_queued_add(sk, skb->truesize); 6908c2ecf20Sopenharmony_ci sk_mem_charge(sk, skb->truesize); 6918c2ecf20Sopenharmony_ci if (tp->nonagle & TCP_NAGLE_PUSH) 6928c2ecf20Sopenharmony_ci tp->nonagle &= ~TCP_NAGLE_PUSH; 6938c2ecf20Sopenharmony_ci 6948c2ecf20Sopenharmony_ci tcp_slow_start_after_idle_check(sk); 6958c2ecf20Sopenharmony_ci} 6968c2ecf20Sopenharmony_ci 6978c2ecf20Sopenharmony_cistatic inline void tcp_mark_urg(struct tcp_sock *tp, int flags) 6988c2ecf20Sopenharmony_ci{ 6998c2ecf20Sopenharmony_ci if (flags & MSG_OOB) 7008c2ecf20Sopenharmony_ci tp->snd_up = tp->write_seq; 7018c2ecf20Sopenharmony_ci} 7028c2ecf20Sopenharmony_ci 7038c2ecf20Sopenharmony_ci/* If a not yet filled skb is pushed, do not send it if 7048c2ecf20Sopenharmony_ci * we have data packets in Qdisc or NIC queues : 7058c2ecf20Sopenharmony_ci * Because TX completion will happen shortly, it gives a chance 7068c2ecf20Sopenharmony_ci * to coalesce future sendmsg() payload into this skb, without 7078c2ecf20Sopenharmony_ci * need for a timer, and with no latency trade off. 7088c2ecf20Sopenharmony_ci * As packets containing data payload have a bigger truesize 7098c2ecf20Sopenharmony_ci * than pure acks (dataless) packets, the last checks prevent 7108c2ecf20Sopenharmony_ci * autocorking if we only have an ACK in Qdisc/NIC queues, 7118c2ecf20Sopenharmony_ci * or if TX completion was delayed after we processed ACK packet. 7128c2ecf20Sopenharmony_ci */ 7138c2ecf20Sopenharmony_cistatic bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, 7148c2ecf20Sopenharmony_ci int size_goal) 7158c2ecf20Sopenharmony_ci{ 7168c2ecf20Sopenharmony_ci return skb->len < size_goal && 7178c2ecf20Sopenharmony_ci READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && 7188c2ecf20Sopenharmony_ci !tcp_rtx_queue_empty(sk) && 7198c2ecf20Sopenharmony_ci refcount_read(&sk->sk_wmem_alloc) > skb->truesize; 7208c2ecf20Sopenharmony_ci} 7218c2ecf20Sopenharmony_ci 7228c2ecf20Sopenharmony_civoid tcp_push(struct sock *sk, int flags, int mss_now, 7238c2ecf20Sopenharmony_ci int nonagle, int size_goal) 7248c2ecf20Sopenharmony_ci{ 7258c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 7268c2ecf20Sopenharmony_ci struct sk_buff *skb; 7278c2ecf20Sopenharmony_ci 7288c2ecf20Sopenharmony_ci skb = tcp_write_queue_tail(sk); 7298c2ecf20Sopenharmony_ci if (!skb) 7308c2ecf20Sopenharmony_ci return; 7318c2ecf20Sopenharmony_ci if (!(flags & MSG_MORE) || forced_push(tp)) 7328c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 7338c2ecf20Sopenharmony_ci 7348c2ecf20Sopenharmony_ci tcp_mark_urg(tp, flags); 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci if (tcp_should_autocork(sk, skb, size_goal)) { 7378c2ecf20Sopenharmony_ci 7388c2ecf20Sopenharmony_ci /* avoid atomic op if TSQ_THROTTLED bit is already set */ 7398c2ecf20Sopenharmony_ci if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { 7408c2ecf20Sopenharmony_ci NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); 7418c2ecf20Sopenharmony_ci set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); 7428c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 7438c2ecf20Sopenharmony_ci } 7448c2ecf20Sopenharmony_ci /* It is possible TX completion already happened 7458c2ecf20Sopenharmony_ci * before we set TSQ_THROTTLED. 7468c2ecf20Sopenharmony_ci */ 7478c2ecf20Sopenharmony_ci if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) 7488c2ecf20Sopenharmony_ci return; 7498c2ecf20Sopenharmony_ci } 7508c2ecf20Sopenharmony_ci 7518c2ecf20Sopenharmony_ci if (flags & MSG_MORE) 7528c2ecf20Sopenharmony_ci nonagle = TCP_NAGLE_CORK; 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci __tcp_push_pending_frames(sk, mss_now, nonagle); 7558c2ecf20Sopenharmony_ci} 7568c2ecf20Sopenharmony_ci 7578c2ecf20Sopenharmony_cistatic int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, 7588c2ecf20Sopenharmony_ci unsigned int offset, size_t len) 7598c2ecf20Sopenharmony_ci{ 7608c2ecf20Sopenharmony_ci struct tcp_splice_state *tss = rd_desc->arg.data; 7618c2ecf20Sopenharmony_ci int ret; 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_ci ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, 7648c2ecf20Sopenharmony_ci min(rd_desc->count, len), tss->flags); 7658c2ecf20Sopenharmony_ci if (ret > 0) 7668c2ecf20Sopenharmony_ci rd_desc->count -= ret; 7678c2ecf20Sopenharmony_ci return ret; 7688c2ecf20Sopenharmony_ci} 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_cistatic int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) 7718c2ecf20Sopenharmony_ci{ 7728c2ecf20Sopenharmony_ci /* Store TCP splice context information in read_descriptor_t. */ 7738c2ecf20Sopenharmony_ci read_descriptor_t rd_desc = { 7748c2ecf20Sopenharmony_ci .arg.data = tss, 7758c2ecf20Sopenharmony_ci .count = tss->len, 7768c2ecf20Sopenharmony_ci }; 7778c2ecf20Sopenharmony_ci 7788c2ecf20Sopenharmony_ci return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); 7798c2ecf20Sopenharmony_ci} 7808c2ecf20Sopenharmony_ci 7818c2ecf20Sopenharmony_ci/** 7828c2ecf20Sopenharmony_ci * tcp_splice_read - splice data from TCP socket to a pipe 7838c2ecf20Sopenharmony_ci * @sock: socket to splice from 7848c2ecf20Sopenharmony_ci * @ppos: position (not valid) 7858c2ecf20Sopenharmony_ci * @pipe: pipe to splice to 7868c2ecf20Sopenharmony_ci * @len: number of bytes to splice 7878c2ecf20Sopenharmony_ci * @flags: splice modifier flags 7888c2ecf20Sopenharmony_ci * 7898c2ecf20Sopenharmony_ci * Description: 7908c2ecf20Sopenharmony_ci * Will read pages from given socket and fill them into a pipe. 7918c2ecf20Sopenharmony_ci * 7928c2ecf20Sopenharmony_ci **/ 7938c2ecf20Sopenharmony_cissize_t tcp_splice_read(struct socket *sock, loff_t *ppos, 7948c2ecf20Sopenharmony_ci struct pipe_inode_info *pipe, size_t len, 7958c2ecf20Sopenharmony_ci unsigned int flags) 7968c2ecf20Sopenharmony_ci{ 7978c2ecf20Sopenharmony_ci struct sock *sk = sock->sk; 7988c2ecf20Sopenharmony_ci struct tcp_splice_state tss = { 7998c2ecf20Sopenharmony_ci .pipe = pipe, 8008c2ecf20Sopenharmony_ci .len = len, 8018c2ecf20Sopenharmony_ci .flags = flags, 8028c2ecf20Sopenharmony_ci }; 8038c2ecf20Sopenharmony_ci long timeo; 8048c2ecf20Sopenharmony_ci ssize_t spliced; 8058c2ecf20Sopenharmony_ci int ret; 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci sock_rps_record_flow(sk); 8088c2ecf20Sopenharmony_ci /* 8098c2ecf20Sopenharmony_ci * We can't seek on a socket input 8108c2ecf20Sopenharmony_ci */ 8118c2ecf20Sopenharmony_ci if (unlikely(*ppos)) 8128c2ecf20Sopenharmony_ci return -ESPIPE; 8138c2ecf20Sopenharmony_ci 8148c2ecf20Sopenharmony_ci ret = spliced = 0; 8158c2ecf20Sopenharmony_ci 8168c2ecf20Sopenharmony_ci lock_sock(sk); 8178c2ecf20Sopenharmony_ci 8188c2ecf20Sopenharmony_ci timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); 8198c2ecf20Sopenharmony_ci while (tss.len) { 8208c2ecf20Sopenharmony_ci ret = __tcp_splice_read(sk, &tss); 8218c2ecf20Sopenharmony_ci if (ret < 0) 8228c2ecf20Sopenharmony_ci break; 8238c2ecf20Sopenharmony_ci else if (!ret) { 8248c2ecf20Sopenharmony_ci if (spliced) 8258c2ecf20Sopenharmony_ci break; 8268c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_DONE)) 8278c2ecf20Sopenharmony_ci break; 8288c2ecf20Sopenharmony_ci if (sk->sk_err) { 8298c2ecf20Sopenharmony_ci ret = sock_error(sk); 8308c2ecf20Sopenharmony_ci break; 8318c2ecf20Sopenharmony_ci } 8328c2ecf20Sopenharmony_ci if (sk->sk_shutdown & RCV_SHUTDOWN) 8338c2ecf20Sopenharmony_ci break; 8348c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) { 8358c2ecf20Sopenharmony_ci /* 8368c2ecf20Sopenharmony_ci * This occurs when user tries to read 8378c2ecf20Sopenharmony_ci * from never connected socket. 8388c2ecf20Sopenharmony_ci */ 8398c2ecf20Sopenharmony_ci ret = -ENOTCONN; 8408c2ecf20Sopenharmony_ci break; 8418c2ecf20Sopenharmony_ci } 8428c2ecf20Sopenharmony_ci if (!timeo) { 8438c2ecf20Sopenharmony_ci ret = -EAGAIN; 8448c2ecf20Sopenharmony_ci break; 8458c2ecf20Sopenharmony_ci } 8468c2ecf20Sopenharmony_ci /* if __tcp_splice_read() got nothing while we have 8478c2ecf20Sopenharmony_ci * an skb in receive queue, we do not want to loop. 8488c2ecf20Sopenharmony_ci * This might happen with URG data. 8498c2ecf20Sopenharmony_ci */ 8508c2ecf20Sopenharmony_ci if (!skb_queue_empty(&sk->sk_receive_queue)) 8518c2ecf20Sopenharmony_ci break; 8528c2ecf20Sopenharmony_ci sk_wait_data(sk, &timeo, NULL); 8538c2ecf20Sopenharmony_ci if (signal_pending(current)) { 8548c2ecf20Sopenharmony_ci ret = sock_intr_errno(timeo); 8558c2ecf20Sopenharmony_ci break; 8568c2ecf20Sopenharmony_ci } 8578c2ecf20Sopenharmony_ci continue; 8588c2ecf20Sopenharmony_ci } 8598c2ecf20Sopenharmony_ci tss.len -= ret; 8608c2ecf20Sopenharmony_ci spliced += ret; 8618c2ecf20Sopenharmony_ci 8628c2ecf20Sopenharmony_ci if (!timeo) 8638c2ecf20Sopenharmony_ci break; 8648c2ecf20Sopenharmony_ci release_sock(sk); 8658c2ecf20Sopenharmony_ci lock_sock(sk); 8668c2ecf20Sopenharmony_ci 8678c2ecf20Sopenharmony_ci if (sk->sk_err || sk->sk_state == TCP_CLOSE || 8688c2ecf20Sopenharmony_ci (sk->sk_shutdown & RCV_SHUTDOWN) || 8698c2ecf20Sopenharmony_ci signal_pending(current)) 8708c2ecf20Sopenharmony_ci break; 8718c2ecf20Sopenharmony_ci } 8728c2ecf20Sopenharmony_ci 8738c2ecf20Sopenharmony_ci release_sock(sk); 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci if (spliced) 8768c2ecf20Sopenharmony_ci return spliced; 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci return ret; 8798c2ecf20Sopenharmony_ci} 8808c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_splice_read); 8818c2ecf20Sopenharmony_ci 8828c2ecf20Sopenharmony_cistruct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, 8838c2ecf20Sopenharmony_ci bool force_schedule) 8848c2ecf20Sopenharmony_ci{ 8858c2ecf20Sopenharmony_ci struct sk_buff *skb; 8868c2ecf20Sopenharmony_ci 8878c2ecf20Sopenharmony_ci if (likely(!size)) { 8888c2ecf20Sopenharmony_ci skb = sk->sk_tx_skb_cache; 8898c2ecf20Sopenharmony_ci if (skb) { 8908c2ecf20Sopenharmony_ci skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 8918c2ecf20Sopenharmony_ci sk->sk_tx_skb_cache = NULL; 8928c2ecf20Sopenharmony_ci pskb_trim(skb, 0); 8938c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 8948c2ecf20Sopenharmony_ci skb_shinfo(skb)->tx_flags = 0; 8958c2ecf20Sopenharmony_ci memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); 8968c2ecf20Sopenharmony_ci return skb; 8978c2ecf20Sopenharmony_ci } 8988c2ecf20Sopenharmony_ci } 8998c2ecf20Sopenharmony_ci /* The TCP header must be at least 32-bit aligned. */ 9008c2ecf20Sopenharmony_ci size = ALIGN(size, 4); 9018c2ecf20Sopenharmony_ci 9028c2ecf20Sopenharmony_ci if (unlikely(tcp_under_memory_pressure(sk))) 9038c2ecf20Sopenharmony_ci sk_mem_reclaim_partial(sk); 9048c2ecf20Sopenharmony_ci 9058c2ecf20Sopenharmony_ci skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 9068c2ecf20Sopenharmony_ci if (likely(skb)) { 9078c2ecf20Sopenharmony_ci bool mem_scheduled; 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci if (force_schedule) { 9108c2ecf20Sopenharmony_ci mem_scheduled = true; 9118c2ecf20Sopenharmony_ci sk_forced_mem_schedule(sk, skb->truesize); 9128c2ecf20Sopenharmony_ci } else { 9138c2ecf20Sopenharmony_ci mem_scheduled = sk_wmem_schedule(sk, skb->truesize); 9148c2ecf20Sopenharmony_ci } 9158c2ecf20Sopenharmony_ci if (likely(mem_scheduled)) { 9168c2ecf20Sopenharmony_ci skb_reserve(skb, sk->sk_prot->max_header); 9178c2ecf20Sopenharmony_ci /* 9188c2ecf20Sopenharmony_ci * Make sure that we have exactly size bytes 9198c2ecf20Sopenharmony_ci * available to the caller, no more, no less. 9208c2ecf20Sopenharmony_ci */ 9218c2ecf20Sopenharmony_ci skb->reserved_tailroom = skb->end - skb->tail - size; 9228c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 9238c2ecf20Sopenharmony_ci return skb; 9248c2ecf20Sopenharmony_ci } 9258c2ecf20Sopenharmony_ci __kfree_skb(skb); 9268c2ecf20Sopenharmony_ci } else { 9278c2ecf20Sopenharmony_ci sk->sk_prot->enter_memory_pressure(sk); 9288c2ecf20Sopenharmony_ci sk_stream_moderate_sndbuf(sk); 9298c2ecf20Sopenharmony_ci } 9308c2ecf20Sopenharmony_ci return NULL; 9318c2ecf20Sopenharmony_ci} 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_cistatic unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, 9348c2ecf20Sopenharmony_ci int large_allowed) 9358c2ecf20Sopenharmony_ci{ 9368c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 9378c2ecf20Sopenharmony_ci u32 new_size_goal, size_goal; 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci if (!large_allowed) 9408c2ecf20Sopenharmony_ci return mss_now; 9418c2ecf20Sopenharmony_ci 9428c2ecf20Sopenharmony_ci /* Note : tcp_tso_autosize() will eventually split this later */ 9438c2ecf20Sopenharmony_ci new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; 9448c2ecf20Sopenharmony_ci new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); 9458c2ecf20Sopenharmony_ci 9468c2ecf20Sopenharmony_ci /* We try hard to avoid divides here */ 9478c2ecf20Sopenharmony_ci size_goal = tp->gso_segs * mss_now; 9488c2ecf20Sopenharmony_ci if (unlikely(new_size_goal < size_goal || 9498c2ecf20Sopenharmony_ci new_size_goal >= size_goal + mss_now)) { 9508c2ecf20Sopenharmony_ci tp->gso_segs = min_t(u16, new_size_goal / mss_now, 9518c2ecf20Sopenharmony_ci sk->sk_gso_max_segs); 9528c2ecf20Sopenharmony_ci size_goal = tp->gso_segs * mss_now; 9538c2ecf20Sopenharmony_ci } 9548c2ecf20Sopenharmony_ci 9558c2ecf20Sopenharmony_ci return max(size_goal, mss_now); 9568c2ecf20Sopenharmony_ci} 9578c2ecf20Sopenharmony_ci 9588c2ecf20Sopenharmony_ciint tcp_send_mss(struct sock *sk, int *size_goal, int flags) 9598c2ecf20Sopenharmony_ci{ 9608c2ecf20Sopenharmony_ci int mss_now; 9618c2ecf20Sopenharmony_ci 9628c2ecf20Sopenharmony_ci mss_now = tcp_current_mss(sk); 9638c2ecf20Sopenharmony_ci *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); 9648c2ecf20Sopenharmony_ci 9658c2ecf20Sopenharmony_ci return mss_now; 9668c2ecf20Sopenharmony_ci} 9678c2ecf20Sopenharmony_ci 9688c2ecf20Sopenharmony_ci/* In some cases, both sendpage() and sendmsg() could have added 9698c2ecf20Sopenharmony_ci * an skb to the write queue, but failed adding payload on it. 9708c2ecf20Sopenharmony_ci * We need to remove it to consume less memory, but more 9718c2ecf20Sopenharmony_ci * importantly be able to generate EPOLLOUT for Edge Trigger epoll() 9728c2ecf20Sopenharmony_ci * users. 9738c2ecf20Sopenharmony_ci */ 9748c2ecf20Sopenharmony_cistatic void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) 9758c2ecf20Sopenharmony_ci{ 9768c2ecf20Sopenharmony_ci if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { 9778c2ecf20Sopenharmony_ci tcp_unlink_write_queue(skb, sk); 9788c2ecf20Sopenharmony_ci if (tcp_write_queue_empty(sk)) 9798c2ecf20Sopenharmony_ci tcp_chrono_stop(sk, TCP_CHRONO_BUSY); 9808c2ecf20Sopenharmony_ci sk_wmem_free_skb(sk, skb); 9818c2ecf20Sopenharmony_ci } 9828c2ecf20Sopenharmony_ci} 9838c2ecf20Sopenharmony_ci 9848c2ecf20Sopenharmony_cissize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, 9858c2ecf20Sopenharmony_ci size_t size, int flags) 9868c2ecf20Sopenharmony_ci{ 9878c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 9888c2ecf20Sopenharmony_ci int mss_now, size_goal; 9898c2ecf20Sopenharmony_ci int err; 9908c2ecf20Sopenharmony_ci ssize_t copied; 9918c2ecf20Sopenharmony_ci long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 9928c2ecf20Sopenharmony_ci 9938c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_DEBUG_VM) && 9948c2ecf20Sopenharmony_ci WARN_ONCE(!sendpage_ok(page), 9958c2ecf20Sopenharmony_ci "page must not be a Slab one and have page_count > 0")) 9968c2ecf20Sopenharmony_ci return -EINVAL; 9978c2ecf20Sopenharmony_ci 9988c2ecf20Sopenharmony_ci /* Wait for a connection to finish. One exception is TCP Fast Open 9998c2ecf20Sopenharmony_ci * (passive side) where data is allowed to be sent before a connection 10008c2ecf20Sopenharmony_ci * is fully established. 10018c2ecf20Sopenharmony_ci */ 10028c2ecf20Sopenharmony_ci if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && 10038c2ecf20Sopenharmony_ci !tcp_passive_fastopen(sk)) { 10048c2ecf20Sopenharmony_ci err = sk_stream_wait_connect(sk, &timeo); 10058c2ecf20Sopenharmony_ci if (err != 0) 10068c2ecf20Sopenharmony_ci goto out_err; 10078c2ecf20Sopenharmony_ci } 10088c2ecf20Sopenharmony_ci 10098c2ecf20Sopenharmony_ci sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 10108c2ecf20Sopenharmony_ci 10118c2ecf20Sopenharmony_ci mss_now = tcp_send_mss(sk, &size_goal, flags); 10128c2ecf20Sopenharmony_ci copied = 0; 10138c2ecf20Sopenharmony_ci 10148c2ecf20Sopenharmony_ci err = -EPIPE; 10158c2ecf20Sopenharmony_ci if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 10168c2ecf20Sopenharmony_ci goto out_err; 10178c2ecf20Sopenharmony_ci 10188c2ecf20Sopenharmony_ci while (size > 0) { 10198c2ecf20Sopenharmony_ci struct sk_buff *skb = tcp_write_queue_tail(sk); 10208c2ecf20Sopenharmony_ci int copy, i; 10218c2ecf20Sopenharmony_ci bool can_coalesce; 10228c2ecf20Sopenharmony_ci 10238c2ecf20Sopenharmony_ci if (!skb || (copy = size_goal - skb->len) <= 0 || 10248c2ecf20Sopenharmony_ci !tcp_skb_can_collapse_to(skb)) { 10258c2ecf20Sopenharmony_cinew_segment: 10268c2ecf20Sopenharmony_ci if (!sk_stream_memory_free(sk)) 10278c2ecf20Sopenharmony_ci goto wait_for_space; 10288c2ecf20Sopenharmony_ci 10298c2ecf20Sopenharmony_ci skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, 10308c2ecf20Sopenharmony_ci tcp_rtx_and_write_queues_empty(sk)); 10318c2ecf20Sopenharmony_ci if (!skb) 10328c2ecf20Sopenharmony_ci goto wait_for_space; 10338c2ecf20Sopenharmony_ci 10348c2ecf20Sopenharmony_ci#ifdef CONFIG_TLS_DEVICE 10358c2ecf20Sopenharmony_ci skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); 10368c2ecf20Sopenharmony_ci#endif 10378c2ecf20Sopenharmony_ci skb_entail(sk, skb); 10388c2ecf20Sopenharmony_ci copy = size_goal; 10398c2ecf20Sopenharmony_ci } 10408c2ecf20Sopenharmony_ci 10418c2ecf20Sopenharmony_ci if (copy > size) 10428c2ecf20Sopenharmony_ci copy = size; 10438c2ecf20Sopenharmony_ci 10448c2ecf20Sopenharmony_ci i = skb_shinfo(skb)->nr_frags; 10458c2ecf20Sopenharmony_ci can_coalesce = skb_can_coalesce(skb, i, page, offset); 10468c2ecf20Sopenharmony_ci if (!can_coalesce && i >= sysctl_max_skb_frags) { 10478c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 10488c2ecf20Sopenharmony_ci goto new_segment; 10498c2ecf20Sopenharmony_ci } 10508c2ecf20Sopenharmony_ci if (!sk_wmem_schedule(sk, copy)) 10518c2ecf20Sopenharmony_ci goto wait_for_space; 10528c2ecf20Sopenharmony_ci 10538c2ecf20Sopenharmony_ci if (can_coalesce) { 10548c2ecf20Sopenharmony_ci skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 10558c2ecf20Sopenharmony_ci } else { 10568c2ecf20Sopenharmony_ci get_page(page); 10578c2ecf20Sopenharmony_ci skb_fill_page_desc(skb, i, page, offset, copy); 10588c2ecf20Sopenharmony_ci } 10598c2ecf20Sopenharmony_ci 10608c2ecf20Sopenharmony_ci if (!(flags & MSG_NO_SHARED_FRAGS)) 10618c2ecf20Sopenharmony_ci skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; 10628c2ecf20Sopenharmony_ci 10638c2ecf20Sopenharmony_ci skb->len += copy; 10648c2ecf20Sopenharmony_ci skb->data_len += copy; 10658c2ecf20Sopenharmony_ci skb->truesize += copy; 10668c2ecf20Sopenharmony_ci sk_wmem_queued_add(sk, copy); 10678c2ecf20Sopenharmony_ci sk_mem_charge(sk, copy); 10688c2ecf20Sopenharmony_ci skb->ip_summed = CHECKSUM_PARTIAL; 10698c2ecf20Sopenharmony_ci WRITE_ONCE(tp->write_seq, tp->write_seq + copy); 10708c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->end_seq += copy; 10718c2ecf20Sopenharmony_ci tcp_skb_pcount_set(skb, 0); 10728c2ecf20Sopenharmony_ci 10738c2ecf20Sopenharmony_ci if (!copied) 10748c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 10758c2ecf20Sopenharmony_ci 10768c2ecf20Sopenharmony_ci copied += copy; 10778c2ecf20Sopenharmony_ci offset += copy; 10788c2ecf20Sopenharmony_ci size -= copy; 10798c2ecf20Sopenharmony_ci if (!size) 10808c2ecf20Sopenharmony_ci goto out; 10818c2ecf20Sopenharmony_ci 10828c2ecf20Sopenharmony_ci if (skb->len < size_goal || (flags & MSG_OOB)) 10838c2ecf20Sopenharmony_ci continue; 10848c2ecf20Sopenharmony_ci 10858c2ecf20Sopenharmony_ci if (forced_push(tp)) { 10868c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 10878c2ecf20Sopenharmony_ci __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); 10888c2ecf20Sopenharmony_ci } else if (skb == tcp_send_head(sk)) 10898c2ecf20Sopenharmony_ci tcp_push_one(sk, mss_now); 10908c2ecf20Sopenharmony_ci continue; 10918c2ecf20Sopenharmony_ci 10928c2ecf20Sopenharmony_ciwait_for_space: 10938c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 10948c2ecf20Sopenharmony_ci tcp_push(sk, flags & ~MSG_MORE, mss_now, 10958c2ecf20Sopenharmony_ci TCP_NAGLE_PUSH, size_goal); 10968c2ecf20Sopenharmony_ci 10978c2ecf20Sopenharmony_ci err = sk_stream_wait_memory(sk, &timeo); 10988c2ecf20Sopenharmony_ci if (err != 0) 10998c2ecf20Sopenharmony_ci goto do_error; 11008c2ecf20Sopenharmony_ci 11018c2ecf20Sopenharmony_ci mss_now = tcp_send_mss(sk, &size_goal, flags); 11028c2ecf20Sopenharmony_ci } 11038c2ecf20Sopenharmony_ci 11048c2ecf20Sopenharmony_ciout: 11058c2ecf20Sopenharmony_ci if (copied) { 11068c2ecf20Sopenharmony_ci tcp_tx_timestamp(sk, sk->sk_tsflags); 11078c2ecf20Sopenharmony_ci if (!(flags & MSG_SENDPAGE_NOTLAST)) 11088c2ecf20Sopenharmony_ci tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 11098c2ecf20Sopenharmony_ci } 11108c2ecf20Sopenharmony_ci return copied; 11118c2ecf20Sopenharmony_ci 11128c2ecf20Sopenharmony_cido_error: 11138c2ecf20Sopenharmony_ci tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk)); 11148c2ecf20Sopenharmony_ci if (copied) 11158c2ecf20Sopenharmony_ci goto out; 11168c2ecf20Sopenharmony_ciout_err: 11178c2ecf20Sopenharmony_ci /* make sure we wake any epoll edge trigger waiter */ 11188c2ecf20Sopenharmony_ci if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { 11198c2ecf20Sopenharmony_ci sk->sk_write_space(sk); 11208c2ecf20Sopenharmony_ci tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); 11218c2ecf20Sopenharmony_ci } 11228c2ecf20Sopenharmony_ci return sk_stream_error(sk, flags, err); 11238c2ecf20Sopenharmony_ci} 11248c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(do_tcp_sendpages); 11258c2ecf20Sopenharmony_ci 11268c2ecf20Sopenharmony_ciint tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, 11278c2ecf20Sopenharmony_ci size_t size, int flags) 11288c2ecf20Sopenharmony_ci{ 11298c2ecf20Sopenharmony_ci if (!(sk->sk_route_caps & NETIF_F_SG)) 11308c2ecf20Sopenharmony_ci return sock_no_sendpage_locked(sk, page, offset, size, flags); 11318c2ecf20Sopenharmony_ci 11328c2ecf20Sopenharmony_ci tcp_rate_check_app_limited(sk); /* is sending application-limited? */ 11338c2ecf20Sopenharmony_ci 11348c2ecf20Sopenharmony_ci return do_tcp_sendpages(sk, page, offset, size, flags); 11358c2ecf20Sopenharmony_ci} 11368c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_sendpage_locked); 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_ciint tcp_sendpage(struct sock *sk, struct page *page, int offset, 11398c2ecf20Sopenharmony_ci size_t size, int flags) 11408c2ecf20Sopenharmony_ci{ 11418c2ecf20Sopenharmony_ci int ret; 11428c2ecf20Sopenharmony_ci 11438c2ecf20Sopenharmony_ci lock_sock(sk); 11448c2ecf20Sopenharmony_ci ret = tcp_sendpage_locked(sk, page, offset, size, flags); 11458c2ecf20Sopenharmony_ci release_sock(sk); 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_ci return ret; 11488c2ecf20Sopenharmony_ci} 11498c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sendpage); 11508c2ecf20Sopenharmony_ci 11518c2ecf20Sopenharmony_civoid tcp_free_fastopen_req(struct tcp_sock *tp) 11528c2ecf20Sopenharmony_ci{ 11538c2ecf20Sopenharmony_ci if (tp->fastopen_req) { 11548c2ecf20Sopenharmony_ci kfree(tp->fastopen_req); 11558c2ecf20Sopenharmony_ci tp->fastopen_req = NULL; 11568c2ecf20Sopenharmony_ci } 11578c2ecf20Sopenharmony_ci} 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_cistatic int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, 11608c2ecf20Sopenharmony_ci int *copied, size_t size, 11618c2ecf20Sopenharmony_ci struct ubuf_info *uarg) 11628c2ecf20Sopenharmony_ci{ 11638c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 11648c2ecf20Sopenharmony_ci struct inet_sock *inet = inet_sk(sk); 11658c2ecf20Sopenharmony_ci struct sockaddr *uaddr = msg->msg_name; 11668c2ecf20Sopenharmony_ci int err, flags; 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & 11698c2ecf20Sopenharmony_ci TFO_CLIENT_ENABLE) || 11708c2ecf20Sopenharmony_ci (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && 11718c2ecf20Sopenharmony_ci uaddr->sa_family == AF_UNSPEC)) 11728c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 11738c2ecf20Sopenharmony_ci if (tp->fastopen_req) 11748c2ecf20Sopenharmony_ci return -EALREADY; /* Another Fast Open is in progress */ 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), 11778c2ecf20Sopenharmony_ci sk->sk_allocation); 11788c2ecf20Sopenharmony_ci if (unlikely(!tp->fastopen_req)) 11798c2ecf20Sopenharmony_ci return -ENOBUFS; 11808c2ecf20Sopenharmony_ci tp->fastopen_req->data = msg; 11818c2ecf20Sopenharmony_ci tp->fastopen_req->size = size; 11828c2ecf20Sopenharmony_ci tp->fastopen_req->uarg = uarg; 11838c2ecf20Sopenharmony_ci 11848c2ecf20Sopenharmony_ci if (inet->defer_connect) { 11858c2ecf20Sopenharmony_ci err = tcp_connect(sk); 11868c2ecf20Sopenharmony_ci /* Same failure procedure as in tcp_v4/6_connect */ 11878c2ecf20Sopenharmony_ci if (err) { 11888c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 11898c2ecf20Sopenharmony_ci inet->inet_dport = 0; 11908c2ecf20Sopenharmony_ci sk->sk_route_caps = 0; 11918c2ecf20Sopenharmony_ci } 11928c2ecf20Sopenharmony_ci } 11938c2ecf20Sopenharmony_ci flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; 11948c2ecf20Sopenharmony_ci err = __inet_stream_connect(sk->sk_socket, uaddr, 11958c2ecf20Sopenharmony_ci msg->msg_namelen, flags, 1); 11968c2ecf20Sopenharmony_ci /* fastopen_req could already be freed in __inet_stream_connect 11978c2ecf20Sopenharmony_ci * if the connection times out or gets rst 11988c2ecf20Sopenharmony_ci */ 11998c2ecf20Sopenharmony_ci if (tp->fastopen_req) { 12008c2ecf20Sopenharmony_ci *copied = tp->fastopen_req->copied; 12018c2ecf20Sopenharmony_ci tcp_free_fastopen_req(tp); 12028c2ecf20Sopenharmony_ci inet->defer_connect = 0; 12038c2ecf20Sopenharmony_ci } 12048c2ecf20Sopenharmony_ci return err; 12058c2ecf20Sopenharmony_ci} 12068c2ecf20Sopenharmony_ci 12078c2ecf20Sopenharmony_ciint tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) 12088c2ecf20Sopenharmony_ci{ 12098c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 12108c2ecf20Sopenharmony_ci struct ubuf_info *uarg = NULL; 12118c2ecf20Sopenharmony_ci struct sk_buff *skb; 12128c2ecf20Sopenharmony_ci struct sockcm_cookie sockc; 12138c2ecf20Sopenharmony_ci int flags, err, copied = 0; 12148c2ecf20Sopenharmony_ci int mss_now = 0, size_goal, copied_syn = 0; 12158c2ecf20Sopenharmony_ci int process_backlog = 0; 12168c2ecf20Sopenharmony_ci bool zc = false; 12178c2ecf20Sopenharmony_ci long timeo; 12188c2ecf20Sopenharmony_ci 12198c2ecf20Sopenharmony_ci flags = msg->msg_flags; 12208c2ecf20Sopenharmony_ci 12218c2ecf20Sopenharmony_ci if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { 12228c2ecf20Sopenharmony_ci skb = tcp_write_queue_tail(sk); 12238c2ecf20Sopenharmony_ci uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); 12248c2ecf20Sopenharmony_ci if (!uarg) { 12258c2ecf20Sopenharmony_ci err = -ENOBUFS; 12268c2ecf20Sopenharmony_ci goto out_err; 12278c2ecf20Sopenharmony_ci } 12288c2ecf20Sopenharmony_ci 12298c2ecf20Sopenharmony_ci zc = sk->sk_route_caps & NETIF_F_SG; 12308c2ecf20Sopenharmony_ci if (!zc) 12318c2ecf20Sopenharmony_ci uarg->zerocopy = 0; 12328c2ecf20Sopenharmony_ci } 12338c2ecf20Sopenharmony_ci 12348c2ecf20Sopenharmony_ci if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && 12358c2ecf20Sopenharmony_ci !tp->repair) { 12368c2ecf20Sopenharmony_ci err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); 12378c2ecf20Sopenharmony_ci if (err == -EINPROGRESS && copied_syn > 0) 12388c2ecf20Sopenharmony_ci goto out; 12398c2ecf20Sopenharmony_ci else if (err) 12408c2ecf20Sopenharmony_ci goto out_err; 12418c2ecf20Sopenharmony_ci } 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 12448c2ecf20Sopenharmony_ci 12458c2ecf20Sopenharmony_ci tcp_rate_check_app_limited(sk); /* is sending application-limited? */ 12468c2ecf20Sopenharmony_ci 12478c2ecf20Sopenharmony_ci /* Wait for a connection to finish. One exception is TCP Fast Open 12488c2ecf20Sopenharmony_ci * (passive side) where data is allowed to be sent before a connection 12498c2ecf20Sopenharmony_ci * is fully established. 12508c2ecf20Sopenharmony_ci */ 12518c2ecf20Sopenharmony_ci if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && 12528c2ecf20Sopenharmony_ci !tcp_passive_fastopen(sk)) { 12538c2ecf20Sopenharmony_ci err = sk_stream_wait_connect(sk, &timeo); 12548c2ecf20Sopenharmony_ci if (err != 0) 12558c2ecf20Sopenharmony_ci goto do_error; 12568c2ecf20Sopenharmony_ci } 12578c2ecf20Sopenharmony_ci 12588c2ecf20Sopenharmony_ci if (unlikely(tp->repair)) { 12598c2ecf20Sopenharmony_ci if (tp->repair_queue == TCP_RECV_QUEUE) { 12608c2ecf20Sopenharmony_ci copied = tcp_send_rcvq(sk, msg, size); 12618c2ecf20Sopenharmony_ci goto out_nopush; 12628c2ecf20Sopenharmony_ci } 12638c2ecf20Sopenharmony_ci 12648c2ecf20Sopenharmony_ci err = -EINVAL; 12658c2ecf20Sopenharmony_ci if (tp->repair_queue == TCP_NO_QUEUE) 12668c2ecf20Sopenharmony_ci goto out_err; 12678c2ecf20Sopenharmony_ci 12688c2ecf20Sopenharmony_ci /* 'common' sending to sendq */ 12698c2ecf20Sopenharmony_ci } 12708c2ecf20Sopenharmony_ci 12718c2ecf20Sopenharmony_ci sockcm_init(&sockc, sk); 12728c2ecf20Sopenharmony_ci if (msg->msg_controllen) { 12738c2ecf20Sopenharmony_ci err = sock_cmsg_send(sk, msg, &sockc); 12748c2ecf20Sopenharmony_ci if (unlikely(err)) { 12758c2ecf20Sopenharmony_ci err = -EINVAL; 12768c2ecf20Sopenharmony_ci goto out_err; 12778c2ecf20Sopenharmony_ci } 12788c2ecf20Sopenharmony_ci } 12798c2ecf20Sopenharmony_ci 12808c2ecf20Sopenharmony_ci /* This should be in poll */ 12818c2ecf20Sopenharmony_ci sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 12828c2ecf20Sopenharmony_ci 12838c2ecf20Sopenharmony_ci /* Ok commence sending. */ 12848c2ecf20Sopenharmony_ci copied = 0; 12858c2ecf20Sopenharmony_ci 12868c2ecf20Sopenharmony_cirestart: 12878c2ecf20Sopenharmony_ci mss_now = tcp_send_mss(sk, &size_goal, flags); 12888c2ecf20Sopenharmony_ci 12898c2ecf20Sopenharmony_ci err = -EPIPE; 12908c2ecf20Sopenharmony_ci if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 12918c2ecf20Sopenharmony_ci goto do_error; 12928c2ecf20Sopenharmony_ci 12938c2ecf20Sopenharmony_ci while (msg_data_left(msg)) { 12948c2ecf20Sopenharmony_ci int copy = 0; 12958c2ecf20Sopenharmony_ci 12968c2ecf20Sopenharmony_ci skb = tcp_write_queue_tail(sk); 12978c2ecf20Sopenharmony_ci if (skb) 12988c2ecf20Sopenharmony_ci copy = size_goal - skb->len; 12998c2ecf20Sopenharmony_ci 13008c2ecf20Sopenharmony_ci if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { 13018c2ecf20Sopenharmony_ci bool first_skb; 13028c2ecf20Sopenharmony_ci 13038c2ecf20Sopenharmony_cinew_segment: 13048c2ecf20Sopenharmony_ci if (!sk_stream_memory_free(sk)) 13058c2ecf20Sopenharmony_ci goto wait_for_space; 13068c2ecf20Sopenharmony_ci 13078c2ecf20Sopenharmony_ci if (unlikely(process_backlog >= 16)) { 13088c2ecf20Sopenharmony_ci process_backlog = 0; 13098c2ecf20Sopenharmony_ci if (sk_flush_backlog(sk)) 13108c2ecf20Sopenharmony_ci goto restart; 13118c2ecf20Sopenharmony_ci } 13128c2ecf20Sopenharmony_ci first_skb = tcp_rtx_and_write_queues_empty(sk); 13138c2ecf20Sopenharmony_ci skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, 13148c2ecf20Sopenharmony_ci first_skb); 13158c2ecf20Sopenharmony_ci if (!skb) 13168c2ecf20Sopenharmony_ci goto wait_for_space; 13178c2ecf20Sopenharmony_ci 13188c2ecf20Sopenharmony_ci process_backlog++; 13198c2ecf20Sopenharmony_ci skb->ip_summed = CHECKSUM_PARTIAL; 13208c2ecf20Sopenharmony_ci 13218c2ecf20Sopenharmony_ci skb_entail(sk, skb); 13228c2ecf20Sopenharmony_ci copy = size_goal; 13238c2ecf20Sopenharmony_ci 13248c2ecf20Sopenharmony_ci /* All packets are restored as if they have 13258c2ecf20Sopenharmony_ci * already been sent. skb_mstamp_ns isn't set to 13268c2ecf20Sopenharmony_ci * avoid wrong rtt estimation. 13278c2ecf20Sopenharmony_ci */ 13288c2ecf20Sopenharmony_ci if (tp->repair) 13298c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; 13308c2ecf20Sopenharmony_ci } 13318c2ecf20Sopenharmony_ci 13328c2ecf20Sopenharmony_ci /* Try to append data to the end of skb. */ 13338c2ecf20Sopenharmony_ci if (copy > msg_data_left(msg)) 13348c2ecf20Sopenharmony_ci copy = msg_data_left(msg); 13358c2ecf20Sopenharmony_ci 13368c2ecf20Sopenharmony_ci /* Where to copy to? */ 13378c2ecf20Sopenharmony_ci if (skb_availroom(skb) > 0 && !zc) { 13388c2ecf20Sopenharmony_ci /* We have some space in skb head. Superb! */ 13398c2ecf20Sopenharmony_ci copy = min_t(int, copy, skb_availroom(skb)); 13408c2ecf20Sopenharmony_ci err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); 13418c2ecf20Sopenharmony_ci if (err) 13428c2ecf20Sopenharmony_ci goto do_fault; 13438c2ecf20Sopenharmony_ci } else if (!zc) { 13448c2ecf20Sopenharmony_ci bool merge = true; 13458c2ecf20Sopenharmony_ci int i = skb_shinfo(skb)->nr_frags; 13468c2ecf20Sopenharmony_ci struct page_frag *pfrag = sk_page_frag(sk); 13478c2ecf20Sopenharmony_ci 13488c2ecf20Sopenharmony_ci if (!sk_page_frag_refill(sk, pfrag)) 13498c2ecf20Sopenharmony_ci goto wait_for_space; 13508c2ecf20Sopenharmony_ci 13518c2ecf20Sopenharmony_ci if (!skb_can_coalesce(skb, i, pfrag->page, 13528c2ecf20Sopenharmony_ci pfrag->offset)) { 13538c2ecf20Sopenharmony_ci if (i >= sysctl_max_skb_frags) { 13548c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 13558c2ecf20Sopenharmony_ci goto new_segment; 13568c2ecf20Sopenharmony_ci } 13578c2ecf20Sopenharmony_ci merge = false; 13588c2ecf20Sopenharmony_ci } 13598c2ecf20Sopenharmony_ci 13608c2ecf20Sopenharmony_ci copy = min_t(int, copy, pfrag->size - pfrag->offset); 13618c2ecf20Sopenharmony_ci 13628c2ecf20Sopenharmony_ci if (!sk_wmem_schedule(sk, copy)) 13638c2ecf20Sopenharmony_ci goto wait_for_space; 13648c2ecf20Sopenharmony_ci 13658c2ecf20Sopenharmony_ci err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, 13668c2ecf20Sopenharmony_ci pfrag->page, 13678c2ecf20Sopenharmony_ci pfrag->offset, 13688c2ecf20Sopenharmony_ci copy); 13698c2ecf20Sopenharmony_ci if (err) 13708c2ecf20Sopenharmony_ci goto do_error; 13718c2ecf20Sopenharmony_ci 13728c2ecf20Sopenharmony_ci /* Update the skb. */ 13738c2ecf20Sopenharmony_ci if (merge) { 13748c2ecf20Sopenharmony_ci skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 13758c2ecf20Sopenharmony_ci } else { 13768c2ecf20Sopenharmony_ci skb_fill_page_desc(skb, i, pfrag->page, 13778c2ecf20Sopenharmony_ci pfrag->offset, copy); 13788c2ecf20Sopenharmony_ci page_ref_inc(pfrag->page); 13798c2ecf20Sopenharmony_ci } 13808c2ecf20Sopenharmony_ci pfrag->offset += copy; 13818c2ecf20Sopenharmony_ci } else { 13828c2ecf20Sopenharmony_ci if (!sk_wmem_schedule(sk, copy)) 13838c2ecf20Sopenharmony_ci goto wait_for_space; 13848c2ecf20Sopenharmony_ci 13858c2ecf20Sopenharmony_ci err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); 13868c2ecf20Sopenharmony_ci if (err == -EMSGSIZE || err == -EEXIST) { 13878c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 13888c2ecf20Sopenharmony_ci goto new_segment; 13898c2ecf20Sopenharmony_ci } 13908c2ecf20Sopenharmony_ci if (err < 0) 13918c2ecf20Sopenharmony_ci goto do_error; 13928c2ecf20Sopenharmony_ci copy = err; 13938c2ecf20Sopenharmony_ci } 13948c2ecf20Sopenharmony_ci 13958c2ecf20Sopenharmony_ci if (!copied) 13968c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 13978c2ecf20Sopenharmony_ci 13988c2ecf20Sopenharmony_ci WRITE_ONCE(tp->write_seq, tp->write_seq + copy); 13998c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->end_seq += copy; 14008c2ecf20Sopenharmony_ci tcp_skb_pcount_set(skb, 0); 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ci copied += copy; 14038c2ecf20Sopenharmony_ci if (!msg_data_left(msg)) { 14048c2ecf20Sopenharmony_ci if (unlikely(flags & MSG_EOR)) 14058c2ecf20Sopenharmony_ci TCP_SKB_CB(skb)->eor = 1; 14068c2ecf20Sopenharmony_ci goto out; 14078c2ecf20Sopenharmony_ci } 14088c2ecf20Sopenharmony_ci 14098c2ecf20Sopenharmony_ci if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) 14108c2ecf20Sopenharmony_ci continue; 14118c2ecf20Sopenharmony_ci 14128c2ecf20Sopenharmony_ci if (forced_push(tp)) { 14138c2ecf20Sopenharmony_ci tcp_mark_push(tp, skb); 14148c2ecf20Sopenharmony_ci __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); 14158c2ecf20Sopenharmony_ci } else if (skb == tcp_send_head(sk)) 14168c2ecf20Sopenharmony_ci tcp_push_one(sk, mss_now); 14178c2ecf20Sopenharmony_ci continue; 14188c2ecf20Sopenharmony_ci 14198c2ecf20Sopenharmony_ciwait_for_space: 14208c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 14218c2ecf20Sopenharmony_ci if (copied) 14228c2ecf20Sopenharmony_ci tcp_push(sk, flags & ~MSG_MORE, mss_now, 14238c2ecf20Sopenharmony_ci TCP_NAGLE_PUSH, size_goal); 14248c2ecf20Sopenharmony_ci 14258c2ecf20Sopenharmony_ci err = sk_stream_wait_memory(sk, &timeo); 14268c2ecf20Sopenharmony_ci if (err != 0) 14278c2ecf20Sopenharmony_ci goto do_error; 14288c2ecf20Sopenharmony_ci 14298c2ecf20Sopenharmony_ci mss_now = tcp_send_mss(sk, &size_goal, flags); 14308c2ecf20Sopenharmony_ci } 14318c2ecf20Sopenharmony_ci 14328c2ecf20Sopenharmony_ciout: 14338c2ecf20Sopenharmony_ci if (copied) { 14348c2ecf20Sopenharmony_ci tcp_tx_timestamp(sk, sockc.tsflags); 14358c2ecf20Sopenharmony_ci tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 14368c2ecf20Sopenharmony_ci } 14378c2ecf20Sopenharmony_ciout_nopush: 14388c2ecf20Sopenharmony_ci sock_zerocopy_put(uarg); 14398c2ecf20Sopenharmony_ci return copied + copied_syn; 14408c2ecf20Sopenharmony_ci 14418c2ecf20Sopenharmony_cido_error: 14428c2ecf20Sopenharmony_ci skb = tcp_write_queue_tail(sk); 14438c2ecf20Sopenharmony_cido_fault: 14448c2ecf20Sopenharmony_ci tcp_remove_empty_skb(sk, skb); 14458c2ecf20Sopenharmony_ci 14468c2ecf20Sopenharmony_ci if (copied + copied_syn) 14478c2ecf20Sopenharmony_ci goto out; 14488c2ecf20Sopenharmony_ciout_err: 14498c2ecf20Sopenharmony_ci sock_zerocopy_put_abort(uarg, true); 14508c2ecf20Sopenharmony_ci err = sk_stream_error(sk, flags, err); 14518c2ecf20Sopenharmony_ci /* make sure we wake any epoll edge trigger waiter */ 14528c2ecf20Sopenharmony_ci if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { 14538c2ecf20Sopenharmony_ci sk->sk_write_space(sk); 14548c2ecf20Sopenharmony_ci tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); 14558c2ecf20Sopenharmony_ci } 14568c2ecf20Sopenharmony_ci return err; 14578c2ecf20Sopenharmony_ci} 14588c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_sendmsg_locked); 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ciint tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) 14618c2ecf20Sopenharmony_ci{ 14628c2ecf20Sopenharmony_ci int ret; 14638c2ecf20Sopenharmony_ci 14648c2ecf20Sopenharmony_ci lock_sock(sk); 14658c2ecf20Sopenharmony_ci ret = tcp_sendmsg_locked(sk, msg, size); 14668c2ecf20Sopenharmony_ci release_sock(sk); 14678c2ecf20Sopenharmony_ci 14688c2ecf20Sopenharmony_ci return ret; 14698c2ecf20Sopenharmony_ci} 14708c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sendmsg); 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci/* 14738c2ecf20Sopenharmony_ci * Handle reading urgent data. BSD has very simple semantics for 14748c2ecf20Sopenharmony_ci * this, no blocking and very strange errors 8) 14758c2ecf20Sopenharmony_ci */ 14768c2ecf20Sopenharmony_ci 14778c2ecf20Sopenharmony_cistatic int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) 14788c2ecf20Sopenharmony_ci{ 14798c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 14808c2ecf20Sopenharmony_ci 14818c2ecf20Sopenharmony_ci /* No URG data to read. */ 14828c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || 14838c2ecf20Sopenharmony_ci tp->urg_data == TCP_URG_READ) 14848c2ecf20Sopenharmony_ci return -EINVAL; /* Yes this is right ! */ 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) 14878c2ecf20Sopenharmony_ci return -ENOTCONN; 14888c2ecf20Sopenharmony_ci 14898c2ecf20Sopenharmony_ci if (tp->urg_data & TCP_URG_VALID) { 14908c2ecf20Sopenharmony_ci int err = 0; 14918c2ecf20Sopenharmony_ci char c = tp->urg_data; 14928c2ecf20Sopenharmony_ci 14938c2ecf20Sopenharmony_ci if (!(flags & MSG_PEEK)) 14948c2ecf20Sopenharmony_ci tp->urg_data = TCP_URG_READ; 14958c2ecf20Sopenharmony_ci 14968c2ecf20Sopenharmony_ci /* Read urgent data. */ 14978c2ecf20Sopenharmony_ci msg->msg_flags |= MSG_OOB; 14988c2ecf20Sopenharmony_ci 14998c2ecf20Sopenharmony_ci if (len > 0) { 15008c2ecf20Sopenharmony_ci if (!(flags & MSG_TRUNC)) 15018c2ecf20Sopenharmony_ci err = memcpy_to_msg(msg, &c, 1); 15028c2ecf20Sopenharmony_ci len = 1; 15038c2ecf20Sopenharmony_ci } else 15048c2ecf20Sopenharmony_ci msg->msg_flags |= MSG_TRUNC; 15058c2ecf20Sopenharmony_ci 15068c2ecf20Sopenharmony_ci return err ? -EFAULT : len; 15078c2ecf20Sopenharmony_ci } 15088c2ecf20Sopenharmony_ci 15098c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) 15108c2ecf20Sopenharmony_ci return 0; 15118c2ecf20Sopenharmony_ci 15128c2ecf20Sopenharmony_ci /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and 15138c2ecf20Sopenharmony_ci * the available implementations agree in this case: 15148c2ecf20Sopenharmony_ci * this call should never block, independent of the 15158c2ecf20Sopenharmony_ci * blocking state of the socket. 15168c2ecf20Sopenharmony_ci * Mike <pall@rz.uni-karlsruhe.de> 15178c2ecf20Sopenharmony_ci */ 15188c2ecf20Sopenharmony_ci return -EAGAIN; 15198c2ecf20Sopenharmony_ci} 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_cistatic int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) 15228c2ecf20Sopenharmony_ci{ 15238c2ecf20Sopenharmony_ci struct sk_buff *skb; 15248c2ecf20Sopenharmony_ci int copied = 0, err = 0; 15258c2ecf20Sopenharmony_ci 15268c2ecf20Sopenharmony_ci /* XXX -- need to support SO_PEEK_OFF */ 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_ci skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { 15298c2ecf20Sopenharmony_ci err = skb_copy_datagram_msg(skb, 0, msg, skb->len); 15308c2ecf20Sopenharmony_ci if (err) 15318c2ecf20Sopenharmony_ci return err; 15328c2ecf20Sopenharmony_ci copied += skb->len; 15338c2ecf20Sopenharmony_ci } 15348c2ecf20Sopenharmony_ci 15358c2ecf20Sopenharmony_ci skb_queue_walk(&sk->sk_write_queue, skb) { 15368c2ecf20Sopenharmony_ci err = skb_copy_datagram_msg(skb, 0, msg, skb->len); 15378c2ecf20Sopenharmony_ci if (err) 15388c2ecf20Sopenharmony_ci break; 15398c2ecf20Sopenharmony_ci 15408c2ecf20Sopenharmony_ci copied += skb->len; 15418c2ecf20Sopenharmony_ci } 15428c2ecf20Sopenharmony_ci 15438c2ecf20Sopenharmony_ci return err ?: copied; 15448c2ecf20Sopenharmony_ci} 15458c2ecf20Sopenharmony_ci 15468c2ecf20Sopenharmony_ci/* Clean up the receive buffer for full frames taken by the user, 15478c2ecf20Sopenharmony_ci * then send an ACK if necessary. COPIED is the number of bytes 15488c2ecf20Sopenharmony_ci * tcp_recvmsg has given to the user so far, it speeds up the 15498c2ecf20Sopenharmony_ci * calculation of whether or not we must ACK for the sake of 15508c2ecf20Sopenharmony_ci * a window update. 15518c2ecf20Sopenharmony_ci */ 15528c2ecf20Sopenharmony_civoid tcp_cleanup_rbuf(struct sock *sk, int copied) 15538c2ecf20Sopenharmony_ci{ 15548c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 15558c2ecf20Sopenharmony_ci bool time_to_ack = false; 15568c2ecf20Sopenharmony_ci 15578c2ecf20Sopenharmony_ci struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 15588c2ecf20Sopenharmony_ci 15598c2ecf20Sopenharmony_ci WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 15608c2ecf20Sopenharmony_ci "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 15618c2ecf20Sopenharmony_ci tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 15628c2ecf20Sopenharmony_ci 15638c2ecf20Sopenharmony_ci if (inet_csk_ack_scheduled(sk)) { 15648c2ecf20Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 15658c2ecf20Sopenharmony_ci __u16 rcv_mss = icsk->icsk_ack.rcv_mss; 15668c2ecf20Sopenharmony_ci#ifdef CONFIG_LOWPOWER_PROTOCOL 15678c2ecf20Sopenharmony_ci rcv_mss *= tcp_ack_num(sk); 15688c2ecf20Sopenharmony_ci#endif /* CONFIG_LOWPOWER_PROTOCOL */ 15698c2ecf20Sopenharmony_ci 15708c2ecf20Sopenharmony_ci if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ 15718c2ecf20Sopenharmony_ci tp->rcv_nxt - tp->rcv_wup > rcv_mss || 15728c2ecf20Sopenharmony_ci /* 15738c2ecf20Sopenharmony_ci * If this read emptied read buffer, we send ACK, if 15748c2ecf20Sopenharmony_ci * connection is not bidirectional, user drained 15758c2ecf20Sopenharmony_ci * receive buffer and there was a small segment 15768c2ecf20Sopenharmony_ci * in queue. 15778c2ecf20Sopenharmony_ci */ 15788c2ecf20Sopenharmony_ci (copied > 0 && 15798c2ecf20Sopenharmony_ci ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || 15808c2ecf20Sopenharmony_ci ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 15818c2ecf20Sopenharmony_ci !inet_csk_in_pingpong_mode(sk))) && 15828c2ecf20Sopenharmony_ci !atomic_read(&sk->sk_rmem_alloc))) 15838c2ecf20Sopenharmony_ci time_to_ack = true; 15848c2ecf20Sopenharmony_ci } 15858c2ecf20Sopenharmony_ci 15868c2ecf20Sopenharmony_ci /* We send an ACK if we can now advertise a non-zero window 15878c2ecf20Sopenharmony_ci * which has been raised "significantly". 15888c2ecf20Sopenharmony_ci * 15898c2ecf20Sopenharmony_ci * Even if window raised up to infinity, do not send window open ACK 15908c2ecf20Sopenharmony_ci * in states, where we will not receive more. It is useless. 15918c2ecf20Sopenharmony_ci */ 15928c2ecf20Sopenharmony_ci if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { 15938c2ecf20Sopenharmony_ci __u32 rcv_window_now = tcp_receive_window(tp); 15948c2ecf20Sopenharmony_ci 15958c2ecf20Sopenharmony_ci /* Optimize, __tcp_select_window() is not cheap. */ 15968c2ecf20Sopenharmony_ci if (2*rcv_window_now <= tp->window_clamp) { 15978c2ecf20Sopenharmony_ci __u32 new_window = __tcp_select_window(sk); 15988c2ecf20Sopenharmony_ci 15998c2ecf20Sopenharmony_ci /* Send ACK now, if this read freed lots of space 16008c2ecf20Sopenharmony_ci * in our buffer. Certainly, new_window is new window. 16018c2ecf20Sopenharmony_ci * We can advertise it now, if it is not less than current one. 16028c2ecf20Sopenharmony_ci * "Lots" means "at least twice" here. 16038c2ecf20Sopenharmony_ci */ 16048c2ecf20Sopenharmony_ci if (new_window && new_window >= 2 * rcv_window_now) 16058c2ecf20Sopenharmony_ci time_to_ack = true; 16068c2ecf20Sopenharmony_ci } 16078c2ecf20Sopenharmony_ci } 16088c2ecf20Sopenharmony_ci if (time_to_ack) 16098c2ecf20Sopenharmony_ci tcp_send_ack(sk); 16108c2ecf20Sopenharmony_ci} 16118c2ecf20Sopenharmony_ci 16128c2ecf20Sopenharmony_cistatic struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 16138c2ecf20Sopenharmony_ci{ 16148c2ecf20Sopenharmony_ci struct sk_buff *skb; 16158c2ecf20Sopenharmony_ci u32 offset; 16168c2ecf20Sopenharmony_ci 16178c2ecf20Sopenharmony_ci while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 16188c2ecf20Sopenharmony_ci offset = seq - TCP_SKB_CB(skb)->seq; 16198c2ecf20Sopenharmony_ci if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 16208c2ecf20Sopenharmony_ci pr_err_once("%s: found a SYN, please report !\n", __func__); 16218c2ecf20Sopenharmony_ci offset--; 16228c2ecf20Sopenharmony_ci } 16238c2ecf20Sopenharmony_ci if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { 16248c2ecf20Sopenharmony_ci *off = offset; 16258c2ecf20Sopenharmony_ci return skb; 16268c2ecf20Sopenharmony_ci } 16278c2ecf20Sopenharmony_ci /* This looks weird, but this can happen if TCP collapsing 16288c2ecf20Sopenharmony_ci * splitted a fat GRO packet, while we released socket lock 16298c2ecf20Sopenharmony_ci * in skb_splice_bits() 16308c2ecf20Sopenharmony_ci */ 16318c2ecf20Sopenharmony_ci sk_eat_skb(sk, skb); 16328c2ecf20Sopenharmony_ci } 16338c2ecf20Sopenharmony_ci return NULL; 16348c2ecf20Sopenharmony_ci} 16358c2ecf20Sopenharmony_ci 16368c2ecf20Sopenharmony_ci/* 16378c2ecf20Sopenharmony_ci * This routine provides an alternative to tcp_recvmsg() for routines 16388c2ecf20Sopenharmony_ci * that would like to handle copying from skbuffs directly in 'sendfile' 16398c2ecf20Sopenharmony_ci * fashion. 16408c2ecf20Sopenharmony_ci * Note: 16418c2ecf20Sopenharmony_ci * - It is assumed that the socket was locked by the caller. 16428c2ecf20Sopenharmony_ci * - The routine does not block. 16438c2ecf20Sopenharmony_ci * - At present, there is no support for reading OOB data 16448c2ecf20Sopenharmony_ci * or for 'peeking' the socket using this routine 16458c2ecf20Sopenharmony_ci * (although both would be easy to implement). 16468c2ecf20Sopenharmony_ci */ 16478c2ecf20Sopenharmony_ciint tcp_read_sock(struct sock *sk, read_descriptor_t *desc, 16488c2ecf20Sopenharmony_ci sk_read_actor_t recv_actor) 16498c2ecf20Sopenharmony_ci{ 16508c2ecf20Sopenharmony_ci struct sk_buff *skb; 16518c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 16528c2ecf20Sopenharmony_ci u32 seq = tp->copied_seq; 16538c2ecf20Sopenharmony_ci u32 offset; 16548c2ecf20Sopenharmony_ci int copied = 0; 16558c2ecf20Sopenharmony_ci 16568c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 16578c2ecf20Sopenharmony_ci return -ENOTCONN; 16588c2ecf20Sopenharmony_ci while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { 16598c2ecf20Sopenharmony_ci if (offset < skb->len) { 16608c2ecf20Sopenharmony_ci int used; 16618c2ecf20Sopenharmony_ci size_t len; 16628c2ecf20Sopenharmony_ci 16638c2ecf20Sopenharmony_ci len = skb->len - offset; 16648c2ecf20Sopenharmony_ci /* Stop reading if we hit a patch of urgent data */ 16658c2ecf20Sopenharmony_ci if (tp->urg_data) { 16668c2ecf20Sopenharmony_ci u32 urg_offset = tp->urg_seq - seq; 16678c2ecf20Sopenharmony_ci if (urg_offset < len) 16688c2ecf20Sopenharmony_ci len = urg_offset; 16698c2ecf20Sopenharmony_ci if (!len) 16708c2ecf20Sopenharmony_ci break; 16718c2ecf20Sopenharmony_ci } 16728c2ecf20Sopenharmony_ci used = recv_actor(desc, skb, offset, len); 16738c2ecf20Sopenharmony_ci if (used <= 0) { 16748c2ecf20Sopenharmony_ci if (!copied) 16758c2ecf20Sopenharmony_ci copied = used; 16768c2ecf20Sopenharmony_ci break; 16778c2ecf20Sopenharmony_ci } 16788c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(used > len)) 16798c2ecf20Sopenharmony_ci used = len; 16808c2ecf20Sopenharmony_ci seq += used; 16818c2ecf20Sopenharmony_ci copied += used; 16828c2ecf20Sopenharmony_ci offset += used; 16838c2ecf20Sopenharmony_ci 16848c2ecf20Sopenharmony_ci /* If recv_actor drops the lock (e.g. TCP splice 16858c2ecf20Sopenharmony_ci * receive) the skb pointer might be invalid when 16868c2ecf20Sopenharmony_ci * getting here: tcp_collapse might have deleted it 16878c2ecf20Sopenharmony_ci * while aggregating skbs from the socket queue. 16888c2ecf20Sopenharmony_ci */ 16898c2ecf20Sopenharmony_ci skb = tcp_recv_skb(sk, seq - 1, &offset); 16908c2ecf20Sopenharmony_ci if (!skb) 16918c2ecf20Sopenharmony_ci break; 16928c2ecf20Sopenharmony_ci /* TCP coalescing might have appended data to the skb. 16938c2ecf20Sopenharmony_ci * Try to splice more frags 16948c2ecf20Sopenharmony_ci */ 16958c2ecf20Sopenharmony_ci if (offset + 1 != skb->len) 16968c2ecf20Sopenharmony_ci continue; 16978c2ecf20Sopenharmony_ci } 16988c2ecf20Sopenharmony_ci if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { 16998c2ecf20Sopenharmony_ci sk_eat_skb(sk, skb); 17008c2ecf20Sopenharmony_ci ++seq; 17018c2ecf20Sopenharmony_ci break; 17028c2ecf20Sopenharmony_ci } 17038c2ecf20Sopenharmony_ci sk_eat_skb(sk, skb); 17048c2ecf20Sopenharmony_ci if (!desc->count) 17058c2ecf20Sopenharmony_ci break; 17068c2ecf20Sopenharmony_ci WRITE_ONCE(tp->copied_seq, seq); 17078c2ecf20Sopenharmony_ci } 17088c2ecf20Sopenharmony_ci WRITE_ONCE(tp->copied_seq, seq); 17098c2ecf20Sopenharmony_ci 17108c2ecf20Sopenharmony_ci tcp_rcv_space_adjust(sk); 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci /* Clean up data we have read: This will do ACK frames. */ 17138c2ecf20Sopenharmony_ci if (copied > 0) { 17148c2ecf20Sopenharmony_ci tcp_recv_skb(sk, seq, &offset); 17158c2ecf20Sopenharmony_ci tcp_cleanup_rbuf(sk, copied); 17168c2ecf20Sopenharmony_ci } 17178c2ecf20Sopenharmony_ci return copied; 17188c2ecf20Sopenharmony_ci} 17198c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_read_sock); 17208c2ecf20Sopenharmony_ci 17218c2ecf20Sopenharmony_ciint tcp_peek_len(struct socket *sock) 17228c2ecf20Sopenharmony_ci{ 17238c2ecf20Sopenharmony_ci return tcp_inq(sock->sk); 17248c2ecf20Sopenharmony_ci} 17258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_peek_len); 17268c2ecf20Sopenharmony_ci 17278c2ecf20Sopenharmony_ci/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ 17288c2ecf20Sopenharmony_ciint tcp_set_rcvlowat(struct sock *sk, int val) 17298c2ecf20Sopenharmony_ci{ 17308c2ecf20Sopenharmony_ci int cap; 17318c2ecf20Sopenharmony_ci 17328c2ecf20Sopenharmony_ci if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) 17338c2ecf20Sopenharmony_ci cap = sk->sk_rcvbuf >> 1; 17348c2ecf20Sopenharmony_ci else 17358c2ecf20Sopenharmony_ci cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; 17368c2ecf20Sopenharmony_ci val = min(val, cap); 17378c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); 17388c2ecf20Sopenharmony_ci 17398c2ecf20Sopenharmony_ci /* Check if we need to signal EPOLLIN right now */ 17408c2ecf20Sopenharmony_ci tcp_data_ready(sk); 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) 17438c2ecf20Sopenharmony_ci return 0; 17448c2ecf20Sopenharmony_ci 17458c2ecf20Sopenharmony_ci val <<= 1; 17468c2ecf20Sopenharmony_ci if (val > sk->sk_rcvbuf) { 17478c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_rcvbuf, val); 17488c2ecf20Sopenharmony_ci tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); 17498c2ecf20Sopenharmony_ci } 17508c2ecf20Sopenharmony_ci return 0; 17518c2ecf20Sopenharmony_ci} 17528c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_set_rcvlowat); 17538c2ecf20Sopenharmony_ci 17548c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU 17558c2ecf20Sopenharmony_cistatic const struct vm_operations_struct tcp_vm_ops = { 17568c2ecf20Sopenharmony_ci}; 17578c2ecf20Sopenharmony_ci 17588c2ecf20Sopenharmony_ciint tcp_mmap(struct file *file, struct socket *sock, 17598c2ecf20Sopenharmony_ci struct vm_area_struct *vma) 17608c2ecf20Sopenharmony_ci{ 17618c2ecf20Sopenharmony_ci if (vma->vm_flags & (VM_WRITE | VM_EXEC)) 17628c2ecf20Sopenharmony_ci return -EPERM; 17638c2ecf20Sopenharmony_ci vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); 17648c2ecf20Sopenharmony_ci 17658c2ecf20Sopenharmony_ci /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ 17668c2ecf20Sopenharmony_ci vma->vm_flags |= VM_MIXEDMAP; 17678c2ecf20Sopenharmony_ci 17688c2ecf20Sopenharmony_ci vma->vm_ops = &tcp_vm_ops; 17698c2ecf20Sopenharmony_ci return 0; 17708c2ecf20Sopenharmony_ci} 17718c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_mmap); 17728c2ecf20Sopenharmony_ci 17738c2ecf20Sopenharmony_cistatic skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, 17748c2ecf20Sopenharmony_ci u32 *offset_frag) 17758c2ecf20Sopenharmony_ci{ 17768c2ecf20Sopenharmony_ci skb_frag_t *frag; 17778c2ecf20Sopenharmony_ci 17788c2ecf20Sopenharmony_ci if (unlikely(offset_skb >= skb->len)) 17798c2ecf20Sopenharmony_ci return NULL; 17808c2ecf20Sopenharmony_ci 17818c2ecf20Sopenharmony_ci offset_skb -= skb_headlen(skb); 17828c2ecf20Sopenharmony_ci if ((int)offset_skb < 0 || skb_has_frag_list(skb)) 17838c2ecf20Sopenharmony_ci return NULL; 17848c2ecf20Sopenharmony_ci 17858c2ecf20Sopenharmony_ci frag = skb_shinfo(skb)->frags; 17868c2ecf20Sopenharmony_ci while (offset_skb) { 17878c2ecf20Sopenharmony_ci if (skb_frag_size(frag) > offset_skb) { 17888c2ecf20Sopenharmony_ci *offset_frag = offset_skb; 17898c2ecf20Sopenharmony_ci return frag; 17908c2ecf20Sopenharmony_ci } 17918c2ecf20Sopenharmony_ci offset_skb -= skb_frag_size(frag); 17928c2ecf20Sopenharmony_ci ++frag; 17938c2ecf20Sopenharmony_ci } 17948c2ecf20Sopenharmony_ci *offset_frag = 0; 17958c2ecf20Sopenharmony_ci return frag; 17968c2ecf20Sopenharmony_ci} 17978c2ecf20Sopenharmony_ci 17988c2ecf20Sopenharmony_cistatic bool can_map_frag(const skb_frag_t *frag) 17998c2ecf20Sopenharmony_ci{ 18008c2ecf20Sopenharmony_ci struct page *page; 18018c2ecf20Sopenharmony_ci 18028c2ecf20Sopenharmony_ci if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) 18038c2ecf20Sopenharmony_ci return false; 18048c2ecf20Sopenharmony_ci 18058c2ecf20Sopenharmony_ci page = skb_frag_page(frag); 18068c2ecf20Sopenharmony_ci 18078c2ecf20Sopenharmony_ci if (PageCompound(page) || page->mapping) 18088c2ecf20Sopenharmony_ci return false; 18098c2ecf20Sopenharmony_ci 18108c2ecf20Sopenharmony_ci return true; 18118c2ecf20Sopenharmony_ci} 18128c2ecf20Sopenharmony_ci 18138c2ecf20Sopenharmony_cistatic int find_next_mappable_frag(const skb_frag_t *frag, 18148c2ecf20Sopenharmony_ci int remaining_in_skb) 18158c2ecf20Sopenharmony_ci{ 18168c2ecf20Sopenharmony_ci int offset = 0; 18178c2ecf20Sopenharmony_ci 18188c2ecf20Sopenharmony_ci if (likely(can_map_frag(frag))) 18198c2ecf20Sopenharmony_ci return 0; 18208c2ecf20Sopenharmony_ci 18218c2ecf20Sopenharmony_ci while (offset < remaining_in_skb && !can_map_frag(frag)) { 18228c2ecf20Sopenharmony_ci offset += skb_frag_size(frag); 18238c2ecf20Sopenharmony_ci ++frag; 18248c2ecf20Sopenharmony_ci } 18258c2ecf20Sopenharmony_ci return offset; 18268c2ecf20Sopenharmony_ci} 18278c2ecf20Sopenharmony_ci 18288c2ecf20Sopenharmony_cistatic int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, 18298c2ecf20Sopenharmony_ci struct sk_buff *skb, u32 copylen, 18308c2ecf20Sopenharmony_ci u32 *offset, u32 *seq) 18318c2ecf20Sopenharmony_ci{ 18328c2ecf20Sopenharmony_ci unsigned long copy_address = (unsigned long)zc->copybuf_address; 18338c2ecf20Sopenharmony_ci struct msghdr msg = {}; 18348c2ecf20Sopenharmony_ci struct iovec iov; 18358c2ecf20Sopenharmony_ci int err; 18368c2ecf20Sopenharmony_ci 18378c2ecf20Sopenharmony_ci if (copy_address != zc->copybuf_address) 18388c2ecf20Sopenharmony_ci return -EINVAL; 18398c2ecf20Sopenharmony_ci 18408c2ecf20Sopenharmony_ci err = import_single_range(READ, (void __user *)copy_address, 18418c2ecf20Sopenharmony_ci copylen, &iov, &msg.msg_iter); 18428c2ecf20Sopenharmony_ci if (err) 18438c2ecf20Sopenharmony_ci return err; 18448c2ecf20Sopenharmony_ci err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); 18458c2ecf20Sopenharmony_ci if (err) 18468c2ecf20Sopenharmony_ci return err; 18478c2ecf20Sopenharmony_ci zc->recv_skip_hint -= copylen; 18488c2ecf20Sopenharmony_ci *offset += copylen; 18498c2ecf20Sopenharmony_ci *seq += copylen; 18508c2ecf20Sopenharmony_ci return (__s32)copylen; 18518c2ecf20Sopenharmony_ci} 18528c2ecf20Sopenharmony_ci 18538c2ecf20Sopenharmony_cistatic int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, 18548c2ecf20Sopenharmony_ci struct sock *sk, 18558c2ecf20Sopenharmony_ci struct sk_buff *skb, 18568c2ecf20Sopenharmony_ci u32 *seq, 18578c2ecf20Sopenharmony_ci s32 copybuf_len) 18588c2ecf20Sopenharmony_ci{ 18598c2ecf20Sopenharmony_ci u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); 18608c2ecf20Sopenharmony_ci 18618c2ecf20Sopenharmony_ci if (!copylen) 18628c2ecf20Sopenharmony_ci return 0; 18638c2ecf20Sopenharmony_ci /* skb is null if inq < PAGE_SIZE. */ 18648c2ecf20Sopenharmony_ci if (skb) 18658c2ecf20Sopenharmony_ci offset = *seq - TCP_SKB_CB(skb)->seq; 18668c2ecf20Sopenharmony_ci else 18678c2ecf20Sopenharmony_ci skb = tcp_recv_skb(sk, *seq, &offset); 18688c2ecf20Sopenharmony_ci 18698c2ecf20Sopenharmony_ci zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, 18708c2ecf20Sopenharmony_ci seq); 18718c2ecf20Sopenharmony_ci return zc->copybuf_len < 0 ? 0 : copylen; 18728c2ecf20Sopenharmony_ci} 18738c2ecf20Sopenharmony_ci 18748c2ecf20Sopenharmony_cistatic int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, 18758c2ecf20Sopenharmony_ci struct page **pages, 18768c2ecf20Sopenharmony_ci unsigned long pages_to_map, 18778c2ecf20Sopenharmony_ci unsigned long *insert_addr, 18788c2ecf20Sopenharmony_ci u32 *length_with_pending, 18798c2ecf20Sopenharmony_ci u32 *seq, 18808c2ecf20Sopenharmony_ci struct tcp_zerocopy_receive *zc) 18818c2ecf20Sopenharmony_ci{ 18828c2ecf20Sopenharmony_ci unsigned long pages_remaining = pages_to_map; 18838c2ecf20Sopenharmony_ci int bytes_mapped; 18848c2ecf20Sopenharmony_ci int ret; 18858c2ecf20Sopenharmony_ci 18868c2ecf20Sopenharmony_ci ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); 18878c2ecf20Sopenharmony_ci bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); 18888c2ecf20Sopenharmony_ci /* Even if vm_insert_pages fails, it may have partially succeeded in 18898c2ecf20Sopenharmony_ci * mapping (some but not all of the pages). 18908c2ecf20Sopenharmony_ci */ 18918c2ecf20Sopenharmony_ci *seq += bytes_mapped; 18928c2ecf20Sopenharmony_ci *insert_addr += bytes_mapped; 18938c2ecf20Sopenharmony_ci if (ret) { 18948c2ecf20Sopenharmony_ci /* But if vm_insert_pages did fail, we have to unroll some state 18958c2ecf20Sopenharmony_ci * we speculatively touched before. 18968c2ecf20Sopenharmony_ci */ 18978c2ecf20Sopenharmony_ci const int bytes_not_mapped = PAGE_SIZE * pages_remaining; 18988c2ecf20Sopenharmony_ci *length_with_pending -= bytes_not_mapped; 18998c2ecf20Sopenharmony_ci zc->recv_skip_hint += bytes_not_mapped; 19008c2ecf20Sopenharmony_ci } 19018c2ecf20Sopenharmony_ci return ret; 19028c2ecf20Sopenharmony_ci} 19038c2ecf20Sopenharmony_ci 19048c2ecf20Sopenharmony_cistatic int tcp_zerocopy_receive(struct sock *sk, 19058c2ecf20Sopenharmony_ci struct tcp_zerocopy_receive *zc) 19068c2ecf20Sopenharmony_ci{ 19078c2ecf20Sopenharmony_ci u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0; 19088c2ecf20Sopenharmony_ci unsigned long address = (unsigned long)zc->address; 19098c2ecf20Sopenharmony_ci s32 copybuf_len = zc->copybuf_len; 19108c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 19118c2ecf20Sopenharmony_ci #define PAGE_BATCH_SIZE 8 19128c2ecf20Sopenharmony_ci struct page *pages[PAGE_BATCH_SIZE]; 19138c2ecf20Sopenharmony_ci const skb_frag_t *frags = NULL; 19148c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 19158c2ecf20Sopenharmony_ci struct sk_buff *skb = NULL; 19168c2ecf20Sopenharmony_ci unsigned long pg_idx = 0; 19178c2ecf20Sopenharmony_ci unsigned long curr_addr; 19188c2ecf20Sopenharmony_ci u32 seq = tp->copied_seq; 19198c2ecf20Sopenharmony_ci int inq = tcp_inq(sk); 19208c2ecf20Sopenharmony_ci int ret; 19218c2ecf20Sopenharmony_ci 19228c2ecf20Sopenharmony_ci zc->copybuf_len = 0; 19238c2ecf20Sopenharmony_ci 19248c2ecf20Sopenharmony_ci if (address & (PAGE_SIZE - 1) || address != zc->address) 19258c2ecf20Sopenharmony_ci return -EINVAL; 19268c2ecf20Sopenharmony_ci 19278c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 19288c2ecf20Sopenharmony_ci return -ENOTCONN; 19298c2ecf20Sopenharmony_ci 19308c2ecf20Sopenharmony_ci sock_rps_record_flow(sk); 19318c2ecf20Sopenharmony_ci 19328c2ecf20Sopenharmony_ci mmap_read_lock(current->mm); 19338c2ecf20Sopenharmony_ci 19348c2ecf20Sopenharmony_ci vma = find_vma(current->mm, address); 19358c2ecf20Sopenharmony_ci if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { 19368c2ecf20Sopenharmony_ci mmap_read_unlock(current->mm); 19378c2ecf20Sopenharmony_ci return -EINVAL; 19388c2ecf20Sopenharmony_ci } 19398c2ecf20Sopenharmony_ci vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); 19408c2ecf20Sopenharmony_ci avail_len = min_t(u32, vma_len, inq); 19418c2ecf20Sopenharmony_ci aligned_len = avail_len & ~(PAGE_SIZE - 1); 19428c2ecf20Sopenharmony_ci if (aligned_len) { 19438c2ecf20Sopenharmony_ci zap_page_range(vma, address, aligned_len); 19448c2ecf20Sopenharmony_ci zc->length = aligned_len; 19458c2ecf20Sopenharmony_ci zc->recv_skip_hint = 0; 19468c2ecf20Sopenharmony_ci } else { 19478c2ecf20Sopenharmony_ci zc->length = avail_len; 19488c2ecf20Sopenharmony_ci zc->recv_skip_hint = avail_len; 19498c2ecf20Sopenharmony_ci } 19508c2ecf20Sopenharmony_ci ret = 0; 19518c2ecf20Sopenharmony_ci curr_addr = address; 19528c2ecf20Sopenharmony_ci while (length + PAGE_SIZE <= zc->length) { 19538c2ecf20Sopenharmony_ci int mappable_offset; 19548c2ecf20Sopenharmony_ci 19558c2ecf20Sopenharmony_ci if (zc->recv_skip_hint < PAGE_SIZE) { 19568c2ecf20Sopenharmony_ci u32 offset_frag; 19578c2ecf20Sopenharmony_ci 19588c2ecf20Sopenharmony_ci /* If we're here, finish the current batch. */ 19598c2ecf20Sopenharmony_ci if (pg_idx) { 19608c2ecf20Sopenharmony_ci ret = tcp_zerocopy_vm_insert_batch(vma, pages, 19618c2ecf20Sopenharmony_ci pg_idx, 19628c2ecf20Sopenharmony_ci &curr_addr, 19638c2ecf20Sopenharmony_ci &length, 19648c2ecf20Sopenharmony_ci &seq, zc); 19658c2ecf20Sopenharmony_ci if (ret) 19668c2ecf20Sopenharmony_ci goto out; 19678c2ecf20Sopenharmony_ci pg_idx = 0; 19688c2ecf20Sopenharmony_ci } 19698c2ecf20Sopenharmony_ci if (skb) { 19708c2ecf20Sopenharmony_ci if (zc->recv_skip_hint > 0) 19718c2ecf20Sopenharmony_ci break; 19728c2ecf20Sopenharmony_ci skb = skb->next; 19738c2ecf20Sopenharmony_ci offset = seq - TCP_SKB_CB(skb)->seq; 19748c2ecf20Sopenharmony_ci } else { 19758c2ecf20Sopenharmony_ci skb = tcp_recv_skb(sk, seq, &offset); 19768c2ecf20Sopenharmony_ci } 19778c2ecf20Sopenharmony_ci zc->recv_skip_hint = skb->len - offset; 19788c2ecf20Sopenharmony_ci frags = skb_advance_to_frag(skb, offset, &offset_frag); 19798c2ecf20Sopenharmony_ci if (!frags || offset_frag) 19808c2ecf20Sopenharmony_ci break; 19818c2ecf20Sopenharmony_ci } 19828c2ecf20Sopenharmony_ci 19838c2ecf20Sopenharmony_ci mappable_offset = find_next_mappable_frag(frags, 19848c2ecf20Sopenharmony_ci zc->recv_skip_hint); 19858c2ecf20Sopenharmony_ci if (mappable_offset) { 19868c2ecf20Sopenharmony_ci zc->recv_skip_hint = mappable_offset; 19878c2ecf20Sopenharmony_ci break; 19888c2ecf20Sopenharmony_ci } 19898c2ecf20Sopenharmony_ci pages[pg_idx] = skb_frag_page(frags); 19908c2ecf20Sopenharmony_ci pg_idx++; 19918c2ecf20Sopenharmony_ci length += PAGE_SIZE; 19928c2ecf20Sopenharmony_ci zc->recv_skip_hint -= PAGE_SIZE; 19938c2ecf20Sopenharmony_ci frags++; 19948c2ecf20Sopenharmony_ci if (pg_idx == PAGE_BATCH_SIZE) { 19958c2ecf20Sopenharmony_ci ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, 19968c2ecf20Sopenharmony_ci &curr_addr, &length, 19978c2ecf20Sopenharmony_ci &seq, zc); 19988c2ecf20Sopenharmony_ci if (ret) 19998c2ecf20Sopenharmony_ci goto out; 20008c2ecf20Sopenharmony_ci pg_idx = 0; 20018c2ecf20Sopenharmony_ci } 20028c2ecf20Sopenharmony_ci } 20038c2ecf20Sopenharmony_ci if (pg_idx) { 20048c2ecf20Sopenharmony_ci ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, 20058c2ecf20Sopenharmony_ci &curr_addr, &length, &seq, 20068c2ecf20Sopenharmony_ci zc); 20078c2ecf20Sopenharmony_ci } 20088c2ecf20Sopenharmony_ciout: 20098c2ecf20Sopenharmony_ci mmap_read_unlock(current->mm); 20108c2ecf20Sopenharmony_ci /* Try to copy straggler data. */ 20118c2ecf20Sopenharmony_ci if (!ret) 20128c2ecf20Sopenharmony_ci copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, 20138c2ecf20Sopenharmony_ci copybuf_len); 20148c2ecf20Sopenharmony_ci 20158c2ecf20Sopenharmony_ci if (length + copylen) { 20168c2ecf20Sopenharmony_ci WRITE_ONCE(tp->copied_seq, seq); 20178c2ecf20Sopenharmony_ci tcp_rcv_space_adjust(sk); 20188c2ecf20Sopenharmony_ci 20198c2ecf20Sopenharmony_ci /* Clean up data we have read: This will do ACK frames. */ 20208c2ecf20Sopenharmony_ci tcp_recv_skb(sk, seq, &offset); 20218c2ecf20Sopenharmony_ci tcp_cleanup_rbuf(sk, length + copylen); 20228c2ecf20Sopenharmony_ci ret = 0; 20238c2ecf20Sopenharmony_ci if (length == zc->length) 20248c2ecf20Sopenharmony_ci zc->recv_skip_hint = 0; 20258c2ecf20Sopenharmony_ci } else { 20268c2ecf20Sopenharmony_ci if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) 20278c2ecf20Sopenharmony_ci ret = -EIO; 20288c2ecf20Sopenharmony_ci } 20298c2ecf20Sopenharmony_ci zc->length = length; 20308c2ecf20Sopenharmony_ci return ret; 20318c2ecf20Sopenharmony_ci} 20328c2ecf20Sopenharmony_ci#endif 20338c2ecf20Sopenharmony_ci 20348c2ecf20Sopenharmony_cistatic void tcp_update_recv_tstamps(struct sk_buff *skb, 20358c2ecf20Sopenharmony_ci struct scm_timestamping_internal *tss) 20368c2ecf20Sopenharmony_ci{ 20378c2ecf20Sopenharmony_ci if (skb->tstamp) 20388c2ecf20Sopenharmony_ci tss->ts[0] = ktime_to_timespec64(skb->tstamp); 20398c2ecf20Sopenharmony_ci else 20408c2ecf20Sopenharmony_ci tss->ts[0] = (struct timespec64) {0}; 20418c2ecf20Sopenharmony_ci 20428c2ecf20Sopenharmony_ci if (skb_hwtstamps(skb)->hwtstamp) 20438c2ecf20Sopenharmony_ci tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); 20448c2ecf20Sopenharmony_ci else 20458c2ecf20Sopenharmony_ci tss->ts[2] = (struct timespec64) {0}; 20468c2ecf20Sopenharmony_ci} 20478c2ecf20Sopenharmony_ci 20488c2ecf20Sopenharmony_ci/* Similar to __sock_recv_timestamp, but does not require an skb */ 20498c2ecf20Sopenharmony_cistatic void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, 20508c2ecf20Sopenharmony_ci struct scm_timestamping_internal *tss) 20518c2ecf20Sopenharmony_ci{ 20528c2ecf20Sopenharmony_ci int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); 20538c2ecf20Sopenharmony_ci bool has_timestamping = false; 20548c2ecf20Sopenharmony_ci 20558c2ecf20Sopenharmony_ci if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { 20568c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_RCVTSTAMP)) { 20578c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { 20588c2ecf20Sopenharmony_ci if (new_tstamp) { 20598c2ecf20Sopenharmony_ci struct __kernel_timespec kts = { 20608c2ecf20Sopenharmony_ci .tv_sec = tss->ts[0].tv_sec, 20618c2ecf20Sopenharmony_ci .tv_nsec = tss->ts[0].tv_nsec, 20628c2ecf20Sopenharmony_ci }; 20638c2ecf20Sopenharmony_ci put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, 20648c2ecf20Sopenharmony_ci sizeof(kts), &kts); 20658c2ecf20Sopenharmony_ci } else { 20668c2ecf20Sopenharmony_ci struct __kernel_old_timespec ts_old = { 20678c2ecf20Sopenharmony_ci .tv_sec = tss->ts[0].tv_sec, 20688c2ecf20Sopenharmony_ci .tv_nsec = tss->ts[0].tv_nsec, 20698c2ecf20Sopenharmony_ci }; 20708c2ecf20Sopenharmony_ci put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, 20718c2ecf20Sopenharmony_ci sizeof(ts_old), &ts_old); 20728c2ecf20Sopenharmony_ci } 20738c2ecf20Sopenharmony_ci } else { 20748c2ecf20Sopenharmony_ci if (new_tstamp) { 20758c2ecf20Sopenharmony_ci struct __kernel_sock_timeval stv = { 20768c2ecf20Sopenharmony_ci .tv_sec = tss->ts[0].tv_sec, 20778c2ecf20Sopenharmony_ci .tv_usec = tss->ts[0].tv_nsec / 1000, 20788c2ecf20Sopenharmony_ci }; 20798c2ecf20Sopenharmony_ci put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, 20808c2ecf20Sopenharmony_ci sizeof(stv), &stv); 20818c2ecf20Sopenharmony_ci } else { 20828c2ecf20Sopenharmony_ci struct __kernel_old_timeval tv = { 20838c2ecf20Sopenharmony_ci .tv_sec = tss->ts[0].tv_sec, 20848c2ecf20Sopenharmony_ci .tv_usec = tss->ts[0].tv_nsec / 1000, 20858c2ecf20Sopenharmony_ci }; 20868c2ecf20Sopenharmony_ci put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, 20878c2ecf20Sopenharmony_ci sizeof(tv), &tv); 20888c2ecf20Sopenharmony_ci } 20898c2ecf20Sopenharmony_ci } 20908c2ecf20Sopenharmony_ci } 20918c2ecf20Sopenharmony_ci 20928c2ecf20Sopenharmony_ci if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) 20938c2ecf20Sopenharmony_ci has_timestamping = true; 20948c2ecf20Sopenharmony_ci else 20958c2ecf20Sopenharmony_ci tss->ts[0] = (struct timespec64) {0}; 20968c2ecf20Sopenharmony_ci } 20978c2ecf20Sopenharmony_ci 20988c2ecf20Sopenharmony_ci if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { 20998c2ecf20Sopenharmony_ci if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) 21008c2ecf20Sopenharmony_ci has_timestamping = true; 21018c2ecf20Sopenharmony_ci else 21028c2ecf20Sopenharmony_ci tss->ts[2] = (struct timespec64) {0}; 21038c2ecf20Sopenharmony_ci } 21048c2ecf20Sopenharmony_ci 21058c2ecf20Sopenharmony_ci if (has_timestamping) { 21068c2ecf20Sopenharmony_ci tss->ts[1] = (struct timespec64) {0}; 21078c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_TSTAMP_NEW)) 21088c2ecf20Sopenharmony_ci put_cmsg_scm_timestamping64(msg, tss); 21098c2ecf20Sopenharmony_ci else 21108c2ecf20Sopenharmony_ci put_cmsg_scm_timestamping(msg, tss); 21118c2ecf20Sopenharmony_ci } 21128c2ecf20Sopenharmony_ci} 21138c2ecf20Sopenharmony_ci 21148c2ecf20Sopenharmony_cistatic int tcp_inq_hint(struct sock *sk) 21158c2ecf20Sopenharmony_ci{ 21168c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 21178c2ecf20Sopenharmony_ci u32 copied_seq = READ_ONCE(tp->copied_seq); 21188c2ecf20Sopenharmony_ci u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); 21198c2ecf20Sopenharmony_ci int inq; 21208c2ecf20Sopenharmony_ci 21218c2ecf20Sopenharmony_ci inq = rcv_nxt - copied_seq; 21228c2ecf20Sopenharmony_ci if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { 21238c2ecf20Sopenharmony_ci lock_sock(sk); 21248c2ecf20Sopenharmony_ci inq = tp->rcv_nxt - tp->copied_seq; 21258c2ecf20Sopenharmony_ci release_sock(sk); 21268c2ecf20Sopenharmony_ci } 21278c2ecf20Sopenharmony_ci /* After receiving a FIN, tell the user-space to continue reading 21288c2ecf20Sopenharmony_ci * by returning a non-zero inq. 21298c2ecf20Sopenharmony_ci */ 21308c2ecf20Sopenharmony_ci if (inq == 0 && sock_flag(sk, SOCK_DONE)) 21318c2ecf20Sopenharmony_ci inq = 1; 21328c2ecf20Sopenharmony_ci return inq; 21338c2ecf20Sopenharmony_ci} 21348c2ecf20Sopenharmony_ci 21358c2ecf20Sopenharmony_ci/* 21368c2ecf20Sopenharmony_ci * This routine copies from a sock struct into the user buffer. 21378c2ecf20Sopenharmony_ci * 21388c2ecf20Sopenharmony_ci * Technical note: in 2.3 we work on _locked_ socket, so that 21398c2ecf20Sopenharmony_ci * tricks with *seq access order and skb->users are not required. 21408c2ecf20Sopenharmony_ci * Probably, code can be easily improved even more. 21418c2ecf20Sopenharmony_ci */ 21428c2ecf20Sopenharmony_ci 21438c2ecf20Sopenharmony_ciint tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, 21448c2ecf20Sopenharmony_ci int flags, int *addr_len) 21458c2ecf20Sopenharmony_ci{ 21468c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 21478c2ecf20Sopenharmony_ci int copied = 0; 21488c2ecf20Sopenharmony_ci u32 peek_seq; 21498c2ecf20Sopenharmony_ci u32 *seq; 21508c2ecf20Sopenharmony_ci unsigned long used; 21518c2ecf20Sopenharmony_ci int err, inq; 21528c2ecf20Sopenharmony_ci int target; /* Read at least this many bytes */ 21538c2ecf20Sopenharmony_ci long timeo; 21548c2ecf20Sopenharmony_ci struct sk_buff *skb, *last; 21558c2ecf20Sopenharmony_ci u32 urg_hole = 0; 21568c2ecf20Sopenharmony_ci struct scm_timestamping_internal tss; 21578c2ecf20Sopenharmony_ci int cmsg_flags; 21588c2ecf20Sopenharmony_ci 21598c2ecf20Sopenharmony_ci if (unlikely(flags & MSG_ERRQUEUE)) 21608c2ecf20Sopenharmony_ci return inet_recv_error(sk, msg, len, addr_len); 21618c2ecf20Sopenharmony_ci 21628c2ecf20Sopenharmony_ci if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && 21638c2ecf20Sopenharmony_ci (sk->sk_state == TCP_ESTABLISHED)) 21648c2ecf20Sopenharmony_ci sk_busy_loop(sk, nonblock); 21658c2ecf20Sopenharmony_ci 21668c2ecf20Sopenharmony_ci lock_sock(sk); 21678c2ecf20Sopenharmony_ci 21688c2ecf20Sopenharmony_ci err = -ENOTCONN; 21698c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) 21708c2ecf20Sopenharmony_ci goto out; 21718c2ecf20Sopenharmony_ci 21728c2ecf20Sopenharmony_ci cmsg_flags = tp->recvmsg_inq ? 1 : 0; 21738c2ecf20Sopenharmony_ci timeo = sock_rcvtimeo(sk, nonblock); 21748c2ecf20Sopenharmony_ci 21758c2ecf20Sopenharmony_ci /* Urgent data needs to be handled specially. */ 21768c2ecf20Sopenharmony_ci if (flags & MSG_OOB) 21778c2ecf20Sopenharmony_ci goto recv_urg; 21788c2ecf20Sopenharmony_ci 21798c2ecf20Sopenharmony_ci if (unlikely(tp->repair)) { 21808c2ecf20Sopenharmony_ci err = -EPERM; 21818c2ecf20Sopenharmony_ci if (!(flags & MSG_PEEK)) 21828c2ecf20Sopenharmony_ci goto out; 21838c2ecf20Sopenharmony_ci 21848c2ecf20Sopenharmony_ci if (tp->repair_queue == TCP_SEND_QUEUE) 21858c2ecf20Sopenharmony_ci goto recv_sndq; 21868c2ecf20Sopenharmony_ci 21878c2ecf20Sopenharmony_ci err = -EINVAL; 21888c2ecf20Sopenharmony_ci if (tp->repair_queue == TCP_NO_QUEUE) 21898c2ecf20Sopenharmony_ci goto out; 21908c2ecf20Sopenharmony_ci 21918c2ecf20Sopenharmony_ci /* 'common' recv queue MSG_PEEK-ing */ 21928c2ecf20Sopenharmony_ci } 21938c2ecf20Sopenharmony_ci 21948c2ecf20Sopenharmony_ci seq = &tp->copied_seq; 21958c2ecf20Sopenharmony_ci if (flags & MSG_PEEK) { 21968c2ecf20Sopenharmony_ci peek_seq = tp->copied_seq; 21978c2ecf20Sopenharmony_ci seq = &peek_seq; 21988c2ecf20Sopenharmony_ci } 21998c2ecf20Sopenharmony_ci 22008c2ecf20Sopenharmony_ci target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 22018c2ecf20Sopenharmony_ci 22028c2ecf20Sopenharmony_ci do { 22038c2ecf20Sopenharmony_ci u32 offset; 22048c2ecf20Sopenharmony_ci 22058c2ecf20Sopenharmony_ci /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ 22068c2ecf20Sopenharmony_ci if (tp->urg_data && tp->urg_seq == *seq) { 22078c2ecf20Sopenharmony_ci if (copied) 22088c2ecf20Sopenharmony_ci break; 22098c2ecf20Sopenharmony_ci if (signal_pending(current)) { 22108c2ecf20Sopenharmony_ci copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; 22118c2ecf20Sopenharmony_ci break; 22128c2ecf20Sopenharmony_ci } 22138c2ecf20Sopenharmony_ci } 22148c2ecf20Sopenharmony_ci 22158c2ecf20Sopenharmony_ci /* Next get a buffer. */ 22168c2ecf20Sopenharmony_ci 22178c2ecf20Sopenharmony_ci last = skb_peek_tail(&sk->sk_receive_queue); 22188c2ecf20Sopenharmony_ci skb_queue_walk(&sk->sk_receive_queue, skb) { 22198c2ecf20Sopenharmony_ci last = skb; 22208c2ecf20Sopenharmony_ci /* Now that we have two receive queues this 22218c2ecf20Sopenharmony_ci * shouldn't happen. 22228c2ecf20Sopenharmony_ci */ 22238c2ecf20Sopenharmony_ci if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), 22248c2ecf20Sopenharmony_ci "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", 22258c2ecf20Sopenharmony_ci *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 22268c2ecf20Sopenharmony_ci flags)) 22278c2ecf20Sopenharmony_ci break; 22288c2ecf20Sopenharmony_ci 22298c2ecf20Sopenharmony_ci offset = *seq - TCP_SKB_CB(skb)->seq; 22308c2ecf20Sopenharmony_ci if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 22318c2ecf20Sopenharmony_ci pr_err_once("%s: found a SYN, please report !\n", __func__); 22328c2ecf20Sopenharmony_ci offset--; 22338c2ecf20Sopenharmony_ci } 22348c2ecf20Sopenharmony_ci if (offset < skb->len) 22358c2ecf20Sopenharmony_ci goto found_ok_skb; 22368c2ecf20Sopenharmony_ci if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 22378c2ecf20Sopenharmony_ci goto found_fin_ok; 22388c2ecf20Sopenharmony_ci WARN(!(flags & MSG_PEEK), 22398c2ecf20Sopenharmony_ci "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", 22408c2ecf20Sopenharmony_ci *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); 22418c2ecf20Sopenharmony_ci } 22428c2ecf20Sopenharmony_ci 22438c2ecf20Sopenharmony_ci /* Well, if we have backlog, try to process it now yet. */ 22448c2ecf20Sopenharmony_ci 22458c2ecf20Sopenharmony_ci if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) 22468c2ecf20Sopenharmony_ci break; 22478c2ecf20Sopenharmony_ci 22488c2ecf20Sopenharmony_ci if (copied) { 22498c2ecf20Sopenharmony_ci if (sk->sk_err || 22508c2ecf20Sopenharmony_ci sk->sk_state == TCP_CLOSE || 22518c2ecf20Sopenharmony_ci (sk->sk_shutdown & RCV_SHUTDOWN) || 22528c2ecf20Sopenharmony_ci !timeo || 22538c2ecf20Sopenharmony_ci signal_pending(current)) 22548c2ecf20Sopenharmony_ci break; 22558c2ecf20Sopenharmony_ci } else { 22568c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_DONE)) 22578c2ecf20Sopenharmony_ci break; 22588c2ecf20Sopenharmony_ci 22598c2ecf20Sopenharmony_ci if (sk->sk_err) { 22608c2ecf20Sopenharmony_ci copied = sock_error(sk); 22618c2ecf20Sopenharmony_ci break; 22628c2ecf20Sopenharmony_ci } 22638c2ecf20Sopenharmony_ci 22648c2ecf20Sopenharmony_ci if (sk->sk_shutdown & RCV_SHUTDOWN) 22658c2ecf20Sopenharmony_ci break; 22668c2ecf20Sopenharmony_ci 22678c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) { 22688c2ecf20Sopenharmony_ci /* This occurs when user tries to read 22698c2ecf20Sopenharmony_ci * from never connected socket. 22708c2ecf20Sopenharmony_ci */ 22718c2ecf20Sopenharmony_ci copied = -ENOTCONN; 22728c2ecf20Sopenharmony_ci break; 22738c2ecf20Sopenharmony_ci } 22748c2ecf20Sopenharmony_ci 22758c2ecf20Sopenharmony_ci if (!timeo) { 22768c2ecf20Sopenharmony_ci copied = -EAGAIN; 22778c2ecf20Sopenharmony_ci break; 22788c2ecf20Sopenharmony_ci } 22798c2ecf20Sopenharmony_ci 22808c2ecf20Sopenharmony_ci if (signal_pending(current)) { 22818c2ecf20Sopenharmony_ci copied = sock_intr_errno(timeo); 22828c2ecf20Sopenharmony_ci break; 22838c2ecf20Sopenharmony_ci } 22848c2ecf20Sopenharmony_ci } 22858c2ecf20Sopenharmony_ci 22868c2ecf20Sopenharmony_ci tcp_cleanup_rbuf(sk, copied); 22878c2ecf20Sopenharmony_ci 22888c2ecf20Sopenharmony_ci if (copied >= target) { 22898c2ecf20Sopenharmony_ci /* Do not sleep, just process backlog. */ 22908c2ecf20Sopenharmony_ci release_sock(sk); 22918c2ecf20Sopenharmony_ci lock_sock(sk); 22928c2ecf20Sopenharmony_ci } else { 22938c2ecf20Sopenharmony_ci sk_wait_data(sk, &timeo, last); 22948c2ecf20Sopenharmony_ci } 22958c2ecf20Sopenharmony_ci 22968c2ecf20Sopenharmony_ci if ((flags & MSG_PEEK) && 22978c2ecf20Sopenharmony_ci (peek_seq - copied - urg_hole != tp->copied_seq)) { 22988c2ecf20Sopenharmony_ci net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", 22998c2ecf20Sopenharmony_ci current->comm, 23008c2ecf20Sopenharmony_ci task_pid_nr(current)); 23018c2ecf20Sopenharmony_ci peek_seq = tp->copied_seq; 23028c2ecf20Sopenharmony_ci } 23038c2ecf20Sopenharmony_ci continue; 23048c2ecf20Sopenharmony_ci 23058c2ecf20Sopenharmony_cifound_ok_skb: 23068c2ecf20Sopenharmony_ci /* Ok so how much can we use? */ 23078c2ecf20Sopenharmony_ci used = skb->len - offset; 23088c2ecf20Sopenharmony_ci if (len < used) 23098c2ecf20Sopenharmony_ci used = len; 23108c2ecf20Sopenharmony_ci 23118c2ecf20Sopenharmony_ci /* Do we have urgent data here? */ 23128c2ecf20Sopenharmony_ci if (tp->urg_data) { 23138c2ecf20Sopenharmony_ci u32 urg_offset = tp->urg_seq - *seq; 23148c2ecf20Sopenharmony_ci if (urg_offset < used) { 23158c2ecf20Sopenharmony_ci if (!urg_offset) { 23168c2ecf20Sopenharmony_ci if (!sock_flag(sk, SOCK_URGINLINE)) { 23178c2ecf20Sopenharmony_ci WRITE_ONCE(*seq, *seq + 1); 23188c2ecf20Sopenharmony_ci urg_hole++; 23198c2ecf20Sopenharmony_ci offset++; 23208c2ecf20Sopenharmony_ci used--; 23218c2ecf20Sopenharmony_ci if (!used) 23228c2ecf20Sopenharmony_ci goto skip_copy; 23238c2ecf20Sopenharmony_ci } 23248c2ecf20Sopenharmony_ci } else 23258c2ecf20Sopenharmony_ci used = urg_offset; 23268c2ecf20Sopenharmony_ci } 23278c2ecf20Sopenharmony_ci } 23288c2ecf20Sopenharmony_ci 23298c2ecf20Sopenharmony_ci if (!(flags & MSG_TRUNC)) { 23308c2ecf20Sopenharmony_ci err = skb_copy_datagram_msg(skb, offset, msg, used); 23318c2ecf20Sopenharmony_ci if (err) { 23328c2ecf20Sopenharmony_ci /* Exception. Bailout! */ 23338c2ecf20Sopenharmony_ci if (!copied) 23348c2ecf20Sopenharmony_ci copied = -EFAULT; 23358c2ecf20Sopenharmony_ci break; 23368c2ecf20Sopenharmony_ci } 23378c2ecf20Sopenharmony_ci } 23388c2ecf20Sopenharmony_ci 23398c2ecf20Sopenharmony_ci WRITE_ONCE(*seq, *seq + used); 23408c2ecf20Sopenharmony_ci copied += used; 23418c2ecf20Sopenharmony_ci len -= used; 23428c2ecf20Sopenharmony_ci 23438c2ecf20Sopenharmony_ci tcp_rcv_space_adjust(sk); 23448c2ecf20Sopenharmony_ci 23458c2ecf20Sopenharmony_ciskip_copy: 23468c2ecf20Sopenharmony_ci if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { 23478c2ecf20Sopenharmony_ci tp->urg_data = 0; 23488c2ecf20Sopenharmony_ci tcp_fast_path_check(sk); 23498c2ecf20Sopenharmony_ci } 23508c2ecf20Sopenharmony_ci 23518c2ecf20Sopenharmony_ci if (TCP_SKB_CB(skb)->has_rxtstamp) { 23528c2ecf20Sopenharmony_ci tcp_update_recv_tstamps(skb, &tss); 23538c2ecf20Sopenharmony_ci cmsg_flags |= 2; 23548c2ecf20Sopenharmony_ci } 23558c2ecf20Sopenharmony_ci 23568c2ecf20Sopenharmony_ci if (used + offset < skb->len) 23578c2ecf20Sopenharmony_ci continue; 23588c2ecf20Sopenharmony_ci 23598c2ecf20Sopenharmony_ci if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 23608c2ecf20Sopenharmony_ci goto found_fin_ok; 23618c2ecf20Sopenharmony_ci if (!(flags & MSG_PEEK)) 23628c2ecf20Sopenharmony_ci sk_eat_skb(sk, skb); 23638c2ecf20Sopenharmony_ci continue; 23648c2ecf20Sopenharmony_ci 23658c2ecf20Sopenharmony_cifound_fin_ok: 23668c2ecf20Sopenharmony_ci /* Process the FIN. */ 23678c2ecf20Sopenharmony_ci WRITE_ONCE(*seq, *seq + 1); 23688c2ecf20Sopenharmony_ci if (!(flags & MSG_PEEK)) 23698c2ecf20Sopenharmony_ci sk_eat_skb(sk, skb); 23708c2ecf20Sopenharmony_ci break; 23718c2ecf20Sopenharmony_ci } while (len > 0); 23728c2ecf20Sopenharmony_ci 23738c2ecf20Sopenharmony_ci /* According to UNIX98, msg_name/msg_namelen are ignored 23748c2ecf20Sopenharmony_ci * on connected socket. I was just happy when found this 8) --ANK 23758c2ecf20Sopenharmony_ci */ 23768c2ecf20Sopenharmony_ci 23778c2ecf20Sopenharmony_ci /* Clean up data we have read: This will do ACK frames. */ 23788c2ecf20Sopenharmony_ci tcp_cleanup_rbuf(sk, copied); 23798c2ecf20Sopenharmony_ci 23808c2ecf20Sopenharmony_ci release_sock(sk); 23818c2ecf20Sopenharmony_ci 23828c2ecf20Sopenharmony_ci if (cmsg_flags) { 23838c2ecf20Sopenharmony_ci if (cmsg_flags & 2) 23848c2ecf20Sopenharmony_ci tcp_recv_timestamp(msg, sk, &tss); 23858c2ecf20Sopenharmony_ci if (cmsg_flags & 1) { 23868c2ecf20Sopenharmony_ci inq = tcp_inq_hint(sk); 23878c2ecf20Sopenharmony_ci put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); 23888c2ecf20Sopenharmony_ci } 23898c2ecf20Sopenharmony_ci } 23908c2ecf20Sopenharmony_ci 23918c2ecf20Sopenharmony_ci return copied; 23928c2ecf20Sopenharmony_ci 23938c2ecf20Sopenharmony_ciout: 23948c2ecf20Sopenharmony_ci release_sock(sk); 23958c2ecf20Sopenharmony_ci return err; 23968c2ecf20Sopenharmony_ci 23978c2ecf20Sopenharmony_cirecv_urg: 23988c2ecf20Sopenharmony_ci err = tcp_recv_urg(sk, msg, len, flags); 23998c2ecf20Sopenharmony_ci goto out; 24008c2ecf20Sopenharmony_ci 24018c2ecf20Sopenharmony_cirecv_sndq: 24028c2ecf20Sopenharmony_ci err = tcp_peek_sndq(sk, msg, len); 24038c2ecf20Sopenharmony_ci goto out; 24048c2ecf20Sopenharmony_ci} 24058c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_recvmsg); 24068c2ecf20Sopenharmony_ci 24078c2ecf20Sopenharmony_civoid tcp_set_state(struct sock *sk, int state) 24088c2ecf20Sopenharmony_ci{ 24098c2ecf20Sopenharmony_ci int oldstate = sk->sk_state; 24108c2ecf20Sopenharmony_ci 24118c2ecf20Sopenharmony_ci /* We defined a new enum for TCP states that are exported in BPF 24128c2ecf20Sopenharmony_ci * so as not force the internal TCP states to be frozen. The 24138c2ecf20Sopenharmony_ci * following checks will detect if an internal state value ever 24148c2ecf20Sopenharmony_ci * differs from the BPF value. If this ever happens, then we will 24158c2ecf20Sopenharmony_ci * need to remap the internal value to the BPF value before calling 24168c2ecf20Sopenharmony_ci * tcp_call_bpf_2arg. 24178c2ecf20Sopenharmony_ci */ 24188c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); 24198c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); 24208c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); 24218c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); 24228c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); 24238c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); 24248c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); 24258c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); 24268c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); 24278c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); 24288c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); 24298c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); 24308c2ecf20Sopenharmony_ci BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); 24318c2ecf20Sopenharmony_ci 24328c2ecf20Sopenharmony_ci if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) 24338c2ecf20Sopenharmony_ci tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); 24348c2ecf20Sopenharmony_ci 24358c2ecf20Sopenharmony_ci switch (state) { 24368c2ecf20Sopenharmony_ci case TCP_ESTABLISHED: 24378c2ecf20Sopenharmony_ci if (oldstate != TCP_ESTABLISHED) 24388c2ecf20Sopenharmony_ci TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); 24398c2ecf20Sopenharmony_ci break; 24408c2ecf20Sopenharmony_ci 24418c2ecf20Sopenharmony_ci case TCP_CLOSE: 24428c2ecf20Sopenharmony_ci if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) 24438c2ecf20Sopenharmony_ci TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); 24448c2ecf20Sopenharmony_ci 24458c2ecf20Sopenharmony_ci sk->sk_prot->unhash(sk); 24468c2ecf20Sopenharmony_ci if (inet_csk(sk)->icsk_bind_hash && 24478c2ecf20Sopenharmony_ci !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 24488c2ecf20Sopenharmony_ci inet_put_port(sk); 24498c2ecf20Sopenharmony_ci fallthrough; 24508c2ecf20Sopenharmony_ci default: 24518c2ecf20Sopenharmony_ci if (oldstate == TCP_ESTABLISHED) 24528c2ecf20Sopenharmony_ci TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); 24538c2ecf20Sopenharmony_ci } 24548c2ecf20Sopenharmony_ci 24558c2ecf20Sopenharmony_ci /* Change state AFTER socket is unhashed to avoid closed 24568c2ecf20Sopenharmony_ci * socket sitting in hash tables. 24578c2ecf20Sopenharmony_ci */ 24588c2ecf20Sopenharmony_ci inet_sk_state_store(sk, state); 24598c2ecf20Sopenharmony_ci} 24608c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_set_state); 24618c2ecf20Sopenharmony_ci 24628c2ecf20Sopenharmony_ci/* 24638c2ecf20Sopenharmony_ci * State processing on a close. This implements the state shift for 24648c2ecf20Sopenharmony_ci * sending our FIN frame. Note that we only send a FIN for some 24658c2ecf20Sopenharmony_ci * states. A shutdown() may have already sent the FIN, or we may be 24668c2ecf20Sopenharmony_ci * closed. 24678c2ecf20Sopenharmony_ci */ 24688c2ecf20Sopenharmony_ci 24698c2ecf20Sopenharmony_cistatic const unsigned char new_state[16] = { 24708c2ecf20Sopenharmony_ci /* current state: new state: action: */ 24718c2ecf20Sopenharmony_ci [0 /* (Invalid) */] = TCP_CLOSE, 24728c2ecf20Sopenharmony_ci [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 24738c2ecf20Sopenharmony_ci [TCP_SYN_SENT] = TCP_CLOSE, 24748c2ecf20Sopenharmony_ci [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 24758c2ecf20Sopenharmony_ci [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, 24768c2ecf20Sopenharmony_ci [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, 24778c2ecf20Sopenharmony_ci [TCP_TIME_WAIT] = TCP_CLOSE, 24788c2ecf20Sopenharmony_ci [TCP_CLOSE] = TCP_CLOSE, 24798c2ecf20Sopenharmony_ci [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, 24808c2ecf20Sopenharmony_ci [TCP_LAST_ACK] = TCP_LAST_ACK, 24818c2ecf20Sopenharmony_ci [TCP_LISTEN] = TCP_CLOSE, 24828c2ecf20Sopenharmony_ci [TCP_CLOSING] = TCP_CLOSING, 24838c2ecf20Sopenharmony_ci [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ 24848c2ecf20Sopenharmony_ci}; 24858c2ecf20Sopenharmony_ci 24868c2ecf20Sopenharmony_cistatic int tcp_close_state(struct sock *sk) 24878c2ecf20Sopenharmony_ci{ 24888c2ecf20Sopenharmony_ci int next = (int)new_state[sk->sk_state]; 24898c2ecf20Sopenharmony_ci int ns = next & TCP_STATE_MASK; 24908c2ecf20Sopenharmony_ci 24918c2ecf20Sopenharmony_ci tcp_set_state(sk, ns); 24928c2ecf20Sopenharmony_ci 24938c2ecf20Sopenharmony_ci return next & TCP_ACTION_FIN; 24948c2ecf20Sopenharmony_ci} 24958c2ecf20Sopenharmony_ci 24968c2ecf20Sopenharmony_ci/* 24978c2ecf20Sopenharmony_ci * Shutdown the sending side of a connection. Much like close except 24988c2ecf20Sopenharmony_ci * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). 24998c2ecf20Sopenharmony_ci */ 25008c2ecf20Sopenharmony_ci 25018c2ecf20Sopenharmony_civoid tcp_shutdown(struct sock *sk, int how) 25028c2ecf20Sopenharmony_ci{ 25038c2ecf20Sopenharmony_ci /* We need to grab some memory, and put together a FIN, 25048c2ecf20Sopenharmony_ci * and then put it into the queue to be sent. 25058c2ecf20Sopenharmony_ci * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. 25068c2ecf20Sopenharmony_ci */ 25078c2ecf20Sopenharmony_ci if (!(how & SEND_SHUTDOWN)) 25088c2ecf20Sopenharmony_ci return; 25098c2ecf20Sopenharmony_ci 25108c2ecf20Sopenharmony_ci /* If we've already sent a FIN, or it's a closed state, skip this. */ 25118c2ecf20Sopenharmony_ci if ((1 << sk->sk_state) & 25128c2ecf20Sopenharmony_ci (TCPF_ESTABLISHED | TCPF_SYN_SENT | 25138c2ecf20Sopenharmony_ci TCPF_CLOSE_WAIT)) { 25148c2ecf20Sopenharmony_ci /* Clear out any half completed packets. FIN if needed. */ 25158c2ecf20Sopenharmony_ci if (tcp_close_state(sk)) 25168c2ecf20Sopenharmony_ci tcp_send_fin(sk); 25178c2ecf20Sopenharmony_ci } 25188c2ecf20Sopenharmony_ci} 25198c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_shutdown); 25208c2ecf20Sopenharmony_ci 25218c2ecf20Sopenharmony_ciint tcp_orphan_count_sum(void) 25228c2ecf20Sopenharmony_ci{ 25238c2ecf20Sopenharmony_ci int i, total = 0; 25248c2ecf20Sopenharmony_ci 25258c2ecf20Sopenharmony_ci for_each_possible_cpu(i) 25268c2ecf20Sopenharmony_ci total += per_cpu(tcp_orphan_count, i); 25278c2ecf20Sopenharmony_ci 25288c2ecf20Sopenharmony_ci return max(total, 0); 25298c2ecf20Sopenharmony_ci} 25308c2ecf20Sopenharmony_ci 25318c2ecf20Sopenharmony_cistatic int tcp_orphan_cache; 25328c2ecf20Sopenharmony_cistatic struct timer_list tcp_orphan_timer; 25338c2ecf20Sopenharmony_ci#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) 25348c2ecf20Sopenharmony_ci 25358c2ecf20Sopenharmony_cistatic void tcp_orphan_update(struct timer_list *unused) 25368c2ecf20Sopenharmony_ci{ 25378c2ecf20Sopenharmony_ci WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); 25388c2ecf20Sopenharmony_ci mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); 25398c2ecf20Sopenharmony_ci} 25408c2ecf20Sopenharmony_ci 25418c2ecf20Sopenharmony_cistatic bool tcp_too_many_orphans(int shift) 25428c2ecf20Sopenharmony_ci{ 25438c2ecf20Sopenharmony_ci return READ_ONCE(tcp_orphan_cache) << shift > 25448c2ecf20Sopenharmony_ci READ_ONCE(sysctl_tcp_max_orphans); 25458c2ecf20Sopenharmony_ci} 25468c2ecf20Sopenharmony_ci 25478c2ecf20Sopenharmony_cibool tcp_check_oom(struct sock *sk, int shift) 25488c2ecf20Sopenharmony_ci{ 25498c2ecf20Sopenharmony_ci bool too_many_orphans, out_of_socket_memory; 25508c2ecf20Sopenharmony_ci 25518c2ecf20Sopenharmony_ci too_many_orphans = tcp_too_many_orphans(shift); 25528c2ecf20Sopenharmony_ci out_of_socket_memory = tcp_out_of_memory(sk); 25538c2ecf20Sopenharmony_ci 25548c2ecf20Sopenharmony_ci if (too_many_orphans) 25558c2ecf20Sopenharmony_ci net_info_ratelimited("too many orphaned sockets\n"); 25568c2ecf20Sopenharmony_ci if (out_of_socket_memory) 25578c2ecf20Sopenharmony_ci net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); 25588c2ecf20Sopenharmony_ci return too_many_orphans || out_of_socket_memory; 25598c2ecf20Sopenharmony_ci} 25608c2ecf20Sopenharmony_ci 25618c2ecf20Sopenharmony_civoid __tcp_close(struct sock *sk, long timeout) 25628c2ecf20Sopenharmony_ci{ 25638c2ecf20Sopenharmony_ci struct sk_buff *skb; 25648c2ecf20Sopenharmony_ci int data_was_unread = 0; 25658c2ecf20Sopenharmony_ci int state; 25668c2ecf20Sopenharmony_ci 25678c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 25688c2ecf20Sopenharmony_ci 25698c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) { 25708c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 25718c2ecf20Sopenharmony_ci 25728c2ecf20Sopenharmony_ci /* Special case. */ 25738c2ecf20Sopenharmony_ci inet_csk_listen_stop(sk); 25748c2ecf20Sopenharmony_ci 25758c2ecf20Sopenharmony_ci goto adjudge_to_death; 25768c2ecf20Sopenharmony_ci } 25778c2ecf20Sopenharmony_ci 25788c2ecf20Sopenharmony_ci /* We need to flush the recv. buffs. We do this only on the 25798c2ecf20Sopenharmony_ci * descriptor close, not protocol-sourced closes, because the 25808c2ecf20Sopenharmony_ci * reader process may not have drained the data yet! 25818c2ecf20Sopenharmony_ci */ 25828c2ecf20Sopenharmony_ci while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 25838c2ecf20Sopenharmony_ci u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; 25848c2ecf20Sopenharmony_ci 25858c2ecf20Sopenharmony_ci if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 25868c2ecf20Sopenharmony_ci len--; 25878c2ecf20Sopenharmony_ci data_was_unread += len; 25888c2ecf20Sopenharmony_ci __kfree_skb(skb); 25898c2ecf20Sopenharmony_ci } 25908c2ecf20Sopenharmony_ci 25918c2ecf20Sopenharmony_ci sk_mem_reclaim(sk); 25928c2ecf20Sopenharmony_ci 25938c2ecf20Sopenharmony_ci /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ 25948c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) 25958c2ecf20Sopenharmony_ci goto adjudge_to_death; 25968c2ecf20Sopenharmony_ci 25978c2ecf20Sopenharmony_ci /* As outlined in RFC 2525, section 2.17, we send a RST here because 25988c2ecf20Sopenharmony_ci * data was lost. To witness the awful effects of the old behavior of 25998c2ecf20Sopenharmony_ci * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 26008c2ecf20Sopenharmony_ci * GET in an FTP client, suspend the process, wait for the client to 26018c2ecf20Sopenharmony_ci * advertise a zero window, then kill -9 the FTP client, wheee... 26028c2ecf20Sopenharmony_ci * Note: timeout is always zero in such a case. 26038c2ecf20Sopenharmony_ci */ 26048c2ecf20Sopenharmony_ci if (unlikely(tcp_sk(sk)->repair)) { 26058c2ecf20Sopenharmony_ci sk->sk_prot->disconnect(sk, 0); 26068c2ecf20Sopenharmony_ci } else if (data_was_unread) { 26078c2ecf20Sopenharmony_ci /* Unread data was tossed, zap the connection. */ 26088c2ecf20Sopenharmony_ci NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 26098c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 26108c2ecf20Sopenharmony_ci tcp_send_active_reset(sk, sk->sk_allocation); 26118c2ecf20Sopenharmony_ci } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 26128c2ecf20Sopenharmony_ci /* Check zero linger _after_ checking for unread data. */ 26138c2ecf20Sopenharmony_ci sk->sk_prot->disconnect(sk, 0); 26148c2ecf20Sopenharmony_ci NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 26158c2ecf20Sopenharmony_ci } else if (tcp_close_state(sk)) { 26168c2ecf20Sopenharmony_ci /* We FIN if the application ate all the data before 26178c2ecf20Sopenharmony_ci * zapping the connection. 26188c2ecf20Sopenharmony_ci */ 26198c2ecf20Sopenharmony_ci 26208c2ecf20Sopenharmony_ci /* RED-PEN. Formally speaking, we have broken TCP state 26218c2ecf20Sopenharmony_ci * machine. State transitions: 26228c2ecf20Sopenharmony_ci * 26238c2ecf20Sopenharmony_ci * TCP_ESTABLISHED -> TCP_FIN_WAIT1 26248c2ecf20Sopenharmony_ci * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) 26258c2ecf20Sopenharmony_ci * TCP_CLOSE_WAIT -> TCP_LAST_ACK 26268c2ecf20Sopenharmony_ci * 26278c2ecf20Sopenharmony_ci * are legal only when FIN has been sent (i.e. in window), 26288c2ecf20Sopenharmony_ci * rather than queued out of window. Purists blame. 26298c2ecf20Sopenharmony_ci * 26308c2ecf20Sopenharmony_ci * F.e. "RFC state" is ESTABLISHED, 26318c2ecf20Sopenharmony_ci * if Linux state is FIN-WAIT-1, but FIN is still not sent. 26328c2ecf20Sopenharmony_ci * 26338c2ecf20Sopenharmony_ci * The visible declinations are that sometimes 26348c2ecf20Sopenharmony_ci * we enter time-wait state, when it is not required really 26358c2ecf20Sopenharmony_ci * (harmless), do not send active resets, when they are 26368c2ecf20Sopenharmony_ci * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when 26378c2ecf20Sopenharmony_ci * they look as CLOSING or LAST_ACK for Linux) 26388c2ecf20Sopenharmony_ci * Probably, I missed some more holelets. 26398c2ecf20Sopenharmony_ci * --ANK 26408c2ecf20Sopenharmony_ci * XXX (TFO) - To start off we don't support SYN+ACK+FIN 26418c2ecf20Sopenharmony_ci * in a single packet! (May consider it later but will 26428c2ecf20Sopenharmony_ci * probably need API support or TCP_CORK SYN-ACK until 26438c2ecf20Sopenharmony_ci * data is written and socket is closed.) 26448c2ecf20Sopenharmony_ci */ 26458c2ecf20Sopenharmony_ci tcp_send_fin(sk); 26468c2ecf20Sopenharmony_ci } 26478c2ecf20Sopenharmony_ci 26488c2ecf20Sopenharmony_ci sk_stream_wait_close(sk, timeout); 26498c2ecf20Sopenharmony_ci 26508c2ecf20Sopenharmony_ciadjudge_to_death: 26518c2ecf20Sopenharmony_ci state = sk->sk_state; 26528c2ecf20Sopenharmony_ci sock_hold(sk); 26538c2ecf20Sopenharmony_ci sock_orphan(sk); 26548c2ecf20Sopenharmony_ci 26558c2ecf20Sopenharmony_ci local_bh_disable(); 26568c2ecf20Sopenharmony_ci bh_lock_sock(sk); 26578c2ecf20Sopenharmony_ci /* remove backlog if any, without releasing ownership. */ 26588c2ecf20Sopenharmony_ci __release_sock(sk); 26598c2ecf20Sopenharmony_ci 26608c2ecf20Sopenharmony_ci this_cpu_inc(tcp_orphan_count); 26618c2ecf20Sopenharmony_ci 26628c2ecf20Sopenharmony_ci /* Have we already been destroyed by a softirq or backlog? */ 26638c2ecf20Sopenharmony_ci if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) 26648c2ecf20Sopenharmony_ci goto out; 26658c2ecf20Sopenharmony_ci 26668c2ecf20Sopenharmony_ci /* This is a (useful) BSD violating of the RFC. There is a 26678c2ecf20Sopenharmony_ci * problem with TCP as specified in that the other end could 26688c2ecf20Sopenharmony_ci * keep a socket open forever with no application left this end. 26698c2ecf20Sopenharmony_ci * We use a 1 minute timeout (about the same as BSD) then kill 26708c2ecf20Sopenharmony_ci * our end. If they send after that then tough - BUT: long enough 26718c2ecf20Sopenharmony_ci * that we won't make the old 4*rto = almost no time - whoops 26728c2ecf20Sopenharmony_ci * reset mistake. 26738c2ecf20Sopenharmony_ci * 26748c2ecf20Sopenharmony_ci * Nope, it was not mistake. It is really desired behaviour 26758c2ecf20Sopenharmony_ci * f.e. on http servers, when such sockets are useless, but 26768c2ecf20Sopenharmony_ci * consume significant resources. Let's do it with special 26778c2ecf20Sopenharmony_ci * linger2 option. --ANK 26788c2ecf20Sopenharmony_ci */ 26798c2ecf20Sopenharmony_ci 26808c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_FIN_WAIT2) { 26818c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 26828c2ecf20Sopenharmony_ci if (tp->linger2 < 0) { 26838c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 26848c2ecf20Sopenharmony_ci tcp_send_active_reset(sk, GFP_ATOMIC); 26858c2ecf20Sopenharmony_ci __NET_INC_STATS(sock_net(sk), 26868c2ecf20Sopenharmony_ci LINUX_MIB_TCPABORTONLINGER); 26878c2ecf20Sopenharmony_ci } else { 26888c2ecf20Sopenharmony_ci const int tmo = tcp_fin_time(sk); 26898c2ecf20Sopenharmony_ci 26908c2ecf20Sopenharmony_ci if (tmo > TCP_TIMEWAIT_LEN) { 26918c2ecf20Sopenharmony_ci inet_csk_reset_keepalive_timer(sk, 26928c2ecf20Sopenharmony_ci tmo - TCP_TIMEWAIT_LEN); 26938c2ecf20Sopenharmony_ci } else { 26948c2ecf20Sopenharmony_ci tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 26958c2ecf20Sopenharmony_ci goto out; 26968c2ecf20Sopenharmony_ci } 26978c2ecf20Sopenharmony_ci } 26988c2ecf20Sopenharmony_ci } 26998c2ecf20Sopenharmony_ci if (sk->sk_state != TCP_CLOSE) { 27008c2ecf20Sopenharmony_ci sk_mem_reclaim(sk); 27018c2ecf20Sopenharmony_ci if (tcp_check_oom(sk, 0)) { 27028c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 27038c2ecf20Sopenharmony_ci tcp_send_active_reset(sk, GFP_ATOMIC); 27048c2ecf20Sopenharmony_ci __NET_INC_STATS(sock_net(sk), 27058c2ecf20Sopenharmony_ci LINUX_MIB_TCPABORTONMEMORY); 27068c2ecf20Sopenharmony_ci } else if (!check_net(sock_net(sk))) { 27078c2ecf20Sopenharmony_ci /* Not possible to send reset; just close */ 27088c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 27098c2ecf20Sopenharmony_ci } 27108c2ecf20Sopenharmony_ci } 27118c2ecf20Sopenharmony_ci 27128c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) { 27138c2ecf20Sopenharmony_ci struct request_sock *req; 27148c2ecf20Sopenharmony_ci 27158c2ecf20Sopenharmony_ci req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 27168c2ecf20Sopenharmony_ci lockdep_sock_is_held(sk)); 27178c2ecf20Sopenharmony_ci /* We could get here with a non-NULL req if the socket is 27188c2ecf20Sopenharmony_ci * aborted (e.g., closed with unread data) before 3WHS 27198c2ecf20Sopenharmony_ci * finishes. 27208c2ecf20Sopenharmony_ci */ 27218c2ecf20Sopenharmony_ci if (req) 27228c2ecf20Sopenharmony_ci reqsk_fastopen_remove(sk, req, false); 27238c2ecf20Sopenharmony_ci inet_csk_destroy_sock(sk); 27248c2ecf20Sopenharmony_ci } 27258c2ecf20Sopenharmony_ci /* Otherwise, socket is reprieved until protocol close. */ 27268c2ecf20Sopenharmony_ci 27278c2ecf20Sopenharmony_ciout: 27288c2ecf20Sopenharmony_ci bh_unlock_sock(sk); 27298c2ecf20Sopenharmony_ci local_bh_enable(); 27308c2ecf20Sopenharmony_ci} 27318c2ecf20Sopenharmony_ci 27328c2ecf20Sopenharmony_civoid tcp_close(struct sock *sk, long timeout) 27338c2ecf20Sopenharmony_ci{ 27348c2ecf20Sopenharmony_ci lock_sock(sk); 27358c2ecf20Sopenharmony_ci __tcp_close(sk, timeout); 27368c2ecf20Sopenharmony_ci release_sock(sk); 27378c2ecf20Sopenharmony_ci if (!sk->sk_net_refcnt) 27388c2ecf20Sopenharmony_ci inet_csk_clear_xmit_timers_sync(sk); 27398c2ecf20Sopenharmony_ci sock_put(sk); 27408c2ecf20Sopenharmony_ci} 27418c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_close); 27428c2ecf20Sopenharmony_ci 27438c2ecf20Sopenharmony_ci/* These states need RST on ABORT according to RFC793 */ 27448c2ecf20Sopenharmony_ci 27458c2ecf20Sopenharmony_cistatic inline bool tcp_need_reset(int state) 27468c2ecf20Sopenharmony_ci{ 27478c2ecf20Sopenharmony_ci return (1 << state) & 27488c2ecf20Sopenharmony_ci (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 27498c2ecf20Sopenharmony_ci TCPF_FIN_WAIT2 | TCPF_SYN_RECV); 27508c2ecf20Sopenharmony_ci} 27518c2ecf20Sopenharmony_ci 27528c2ecf20Sopenharmony_cistatic void tcp_rtx_queue_purge(struct sock *sk) 27538c2ecf20Sopenharmony_ci{ 27548c2ecf20Sopenharmony_ci struct rb_node *p = rb_first(&sk->tcp_rtx_queue); 27558c2ecf20Sopenharmony_ci 27568c2ecf20Sopenharmony_ci tcp_sk(sk)->highest_sack = NULL; 27578c2ecf20Sopenharmony_ci while (p) { 27588c2ecf20Sopenharmony_ci struct sk_buff *skb = rb_to_skb(p); 27598c2ecf20Sopenharmony_ci 27608c2ecf20Sopenharmony_ci p = rb_next(p); 27618c2ecf20Sopenharmony_ci /* Since we are deleting whole queue, no need to 27628c2ecf20Sopenharmony_ci * list_del(&skb->tcp_tsorted_anchor) 27638c2ecf20Sopenharmony_ci */ 27648c2ecf20Sopenharmony_ci tcp_rtx_queue_unlink(skb, sk); 27658c2ecf20Sopenharmony_ci sk_wmem_free_skb(sk, skb); 27668c2ecf20Sopenharmony_ci } 27678c2ecf20Sopenharmony_ci} 27688c2ecf20Sopenharmony_ci 27698c2ecf20Sopenharmony_civoid tcp_write_queue_purge(struct sock *sk) 27708c2ecf20Sopenharmony_ci{ 27718c2ecf20Sopenharmony_ci struct sk_buff *skb; 27728c2ecf20Sopenharmony_ci 27738c2ecf20Sopenharmony_ci tcp_chrono_stop(sk, TCP_CHRONO_BUSY); 27748c2ecf20Sopenharmony_ci while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 27758c2ecf20Sopenharmony_ci tcp_skb_tsorted_anchor_cleanup(skb); 27768c2ecf20Sopenharmony_ci sk_wmem_free_skb(sk, skb); 27778c2ecf20Sopenharmony_ci } 27788c2ecf20Sopenharmony_ci tcp_rtx_queue_purge(sk); 27798c2ecf20Sopenharmony_ci skb = sk->sk_tx_skb_cache; 27808c2ecf20Sopenharmony_ci if (skb) { 27818c2ecf20Sopenharmony_ci __kfree_skb(skb); 27828c2ecf20Sopenharmony_ci sk->sk_tx_skb_cache = NULL; 27838c2ecf20Sopenharmony_ci } 27848c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); 27858c2ecf20Sopenharmony_ci sk_mem_reclaim(sk); 27868c2ecf20Sopenharmony_ci tcp_clear_all_retrans_hints(tcp_sk(sk)); 27878c2ecf20Sopenharmony_ci tcp_sk(sk)->packets_out = 0; 27888c2ecf20Sopenharmony_ci inet_csk(sk)->icsk_backoff = 0; 27898c2ecf20Sopenharmony_ci} 27908c2ecf20Sopenharmony_ci 27918c2ecf20Sopenharmony_ciint tcp_disconnect(struct sock *sk, int flags) 27928c2ecf20Sopenharmony_ci{ 27938c2ecf20Sopenharmony_ci struct inet_sock *inet = inet_sk(sk); 27948c2ecf20Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 27958c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 27968c2ecf20Sopenharmony_ci int old_state = sk->sk_state; 27978c2ecf20Sopenharmony_ci u32 seq; 27988c2ecf20Sopenharmony_ci 27998c2ecf20Sopenharmony_ci /* Deny disconnect if other threads are blocked in sk_wait_event() 28008c2ecf20Sopenharmony_ci * or inet_wait_for_connect(). 28018c2ecf20Sopenharmony_ci */ 28028c2ecf20Sopenharmony_ci if (sk->sk_wait_pending) 28038c2ecf20Sopenharmony_ci return -EBUSY; 28048c2ecf20Sopenharmony_ci 28058c2ecf20Sopenharmony_ci if (old_state != TCP_CLOSE) 28068c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 28078c2ecf20Sopenharmony_ci 28088c2ecf20Sopenharmony_ci /* ABORT function of RFC793 */ 28098c2ecf20Sopenharmony_ci if (old_state == TCP_LISTEN) { 28108c2ecf20Sopenharmony_ci inet_csk_listen_stop(sk); 28118c2ecf20Sopenharmony_ci } else if (unlikely(tp->repair)) { 28128c2ecf20Sopenharmony_ci sk->sk_err = ECONNABORTED; 28138c2ecf20Sopenharmony_ci } else if (tcp_need_reset(old_state) || 28148c2ecf20Sopenharmony_ci (tp->snd_nxt != tp->write_seq && 28158c2ecf20Sopenharmony_ci (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 28168c2ecf20Sopenharmony_ci /* The last check adjusts for discrepancy of Linux wrt. RFC 28178c2ecf20Sopenharmony_ci * states 28188c2ecf20Sopenharmony_ci */ 28198c2ecf20Sopenharmony_ci tcp_send_active_reset(sk, gfp_any()); 28208c2ecf20Sopenharmony_ci sk->sk_err = ECONNRESET; 28218c2ecf20Sopenharmony_ci } else if (old_state == TCP_SYN_SENT) 28228c2ecf20Sopenharmony_ci sk->sk_err = ECONNRESET; 28238c2ecf20Sopenharmony_ci 28248c2ecf20Sopenharmony_ci tcp_clear_xmit_timers(sk); 28258c2ecf20Sopenharmony_ci __skb_queue_purge(&sk->sk_receive_queue); 28268c2ecf20Sopenharmony_ci if (sk->sk_rx_skb_cache) { 28278c2ecf20Sopenharmony_ci __kfree_skb(sk->sk_rx_skb_cache); 28288c2ecf20Sopenharmony_ci sk->sk_rx_skb_cache = NULL; 28298c2ecf20Sopenharmony_ci } 28308c2ecf20Sopenharmony_ci WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 28318c2ecf20Sopenharmony_ci tp->urg_data = 0; 28328c2ecf20Sopenharmony_ci tcp_write_queue_purge(sk); 28338c2ecf20Sopenharmony_ci tcp_fastopen_active_disable_ofo_check(sk); 28348c2ecf20Sopenharmony_ci skb_rbtree_purge(&tp->out_of_order_queue); 28358c2ecf20Sopenharmony_ci 28368c2ecf20Sopenharmony_ci inet->inet_dport = 0; 28378c2ecf20Sopenharmony_ci 28388c2ecf20Sopenharmony_ci if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 28398c2ecf20Sopenharmony_ci inet_reset_saddr(sk); 28408c2ecf20Sopenharmony_ci 28418c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, 0); 28428c2ecf20Sopenharmony_ci sock_reset_flag(sk, SOCK_DONE); 28438c2ecf20Sopenharmony_ci tp->srtt_us = 0; 28448c2ecf20Sopenharmony_ci tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 28458c2ecf20Sopenharmony_ci tp->rcv_rtt_last_tsecr = 0; 28468c2ecf20Sopenharmony_ci 28478c2ecf20Sopenharmony_ci seq = tp->write_seq + tp->max_window + 2; 28488c2ecf20Sopenharmony_ci if (!seq) 28498c2ecf20Sopenharmony_ci seq = 1; 28508c2ecf20Sopenharmony_ci WRITE_ONCE(tp->write_seq, seq); 28518c2ecf20Sopenharmony_ci 28528c2ecf20Sopenharmony_ci icsk->icsk_backoff = 0; 28538c2ecf20Sopenharmony_ci icsk->icsk_probes_out = 0; 28548c2ecf20Sopenharmony_ci icsk->icsk_probes_tstamp = 0; 28558c2ecf20Sopenharmony_ci icsk->icsk_rto = TCP_TIMEOUT_INIT; 28568c2ecf20Sopenharmony_ci icsk->icsk_rto_min = TCP_RTO_MIN; 28578c2ecf20Sopenharmony_ci icsk->icsk_delack_max = TCP_DELACK_MAX; 28588c2ecf20Sopenharmony_ci#if defined(CONFIG_TCP_NATA_URC) || defined(CONFIG_TCP_NATA_STL) 28598c2ecf20Sopenharmony_ci icsk->nata_retries_enabled = 0; 28608c2ecf20Sopenharmony_ci icsk->nata_retries_type = NATA_NA; 28618c2ecf20Sopenharmony_ci icsk->nata_syn_rto = TCP_TIMEOUT_INIT; 28628c2ecf20Sopenharmony_ci icsk->nata_data_rto = TCP_TIMEOUT_INIT; 28638c2ecf20Sopenharmony_ci icsk->nata_data_retries = 0; 28648c2ecf20Sopenharmony_ci#endif 28658c2ecf20Sopenharmony_ci tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 28668c2ecf20Sopenharmony_ci tp->snd_cwnd = TCP_INIT_CWND; 28678c2ecf20Sopenharmony_ci tp->snd_cwnd_cnt = 0; 28688c2ecf20Sopenharmony_ci tp->is_cwnd_limited = 0; 28698c2ecf20Sopenharmony_ci tp->max_packets_out = 0; 28708c2ecf20Sopenharmony_ci tp->window_clamp = 0; 28718c2ecf20Sopenharmony_ci tp->delivered = 0; 28728c2ecf20Sopenharmony_ci tp->delivered_ce = 0; 28738c2ecf20Sopenharmony_ci if (icsk->icsk_ca_ops->release) 28748c2ecf20Sopenharmony_ci icsk->icsk_ca_ops->release(sk); 28758c2ecf20Sopenharmony_ci memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); 28768c2ecf20Sopenharmony_ci icsk->icsk_ca_initialized = 0; 28778c2ecf20Sopenharmony_ci tcp_set_ca_state(sk, TCP_CA_Open); 28788c2ecf20Sopenharmony_ci tp->is_sack_reneg = 0; 28798c2ecf20Sopenharmony_ci tcp_clear_retrans(tp); 28808c2ecf20Sopenharmony_ci tp->total_retrans = 0; 28818c2ecf20Sopenharmony_ci inet_csk_delack_init(sk); 28828c2ecf20Sopenharmony_ci /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 28838c2ecf20Sopenharmony_ci * issue in __tcp_select_window() 28848c2ecf20Sopenharmony_ci */ 28858c2ecf20Sopenharmony_ci icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; 28868c2ecf20Sopenharmony_ci memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 28878c2ecf20Sopenharmony_ci __sk_dst_reset(sk); 28888c2ecf20Sopenharmony_ci dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); 28898c2ecf20Sopenharmony_ci tcp_saved_syn_free(tp); 28908c2ecf20Sopenharmony_ci tp->compressed_ack = 0; 28918c2ecf20Sopenharmony_ci tp->segs_in = 0; 28928c2ecf20Sopenharmony_ci tp->segs_out = 0; 28938c2ecf20Sopenharmony_ci tp->bytes_sent = 0; 28948c2ecf20Sopenharmony_ci tp->bytes_acked = 0; 28958c2ecf20Sopenharmony_ci tp->bytes_received = 0; 28968c2ecf20Sopenharmony_ci tp->bytes_retrans = 0; 28978c2ecf20Sopenharmony_ci tp->data_segs_in = 0; 28988c2ecf20Sopenharmony_ci tp->data_segs_out = 0; 28998c2ecf20Sopenharmony_ci tp->duplicate_sack[0].start_seq = 0; 29008c2ecf20Sopenharmony_ci tp->duplicate_sack[0].end_seq = 0; 29018c2ecf20Sopenharmony_ci tp->dsack_dups = 0; 29028c2ecf20Sopenharmony_ci tp->reord_seen = 0; 29038c2ecf20Sopenharmony_ci tp->retrans_out = 0; 29048c2ecf20Sopenharmony_ci tp->sacked_out = 0; 29058c2ecf20Sopenharmony_ci tp->tlp_high_seq = 0; 29068c2ecf20Sopenharmony_ci tp->last_oow_ack_time = 0; 29078c2ecf20Sopenharmony_ci /* There's a bubble in the pipe until at least the first ACK. */ 29088c2ecf20Sopenharmony_ci tp->app_limited = ~0U; 29098c2ecf20Sopenharmony_ci tp->rate_app_limited = 1; 29108c2ecf20Sopenharmony_ci tp->rack.mstamp = 0; 29118c2ecf20Sopenharmony_ci tp->rack.advanced = 0; 29128c2ecf20Sopenharmony_ci tp->rack.reo_wnd_steps = 1; 29138c2ecf20Sopenharmony_ci tp->rack.last_delivered = 0; 29148c2ecf20Sopenharmony_ci tp->rack.reo_wnd_persist = 0; 29158c2ecf20Sopenharmony_ci tp->rack.dsack_seen = 0; 29168c2ecf20Sopenharmony_ci tp->syn_data_acked = 0; 29178c2ecf20Sopenharmony_ci tp->rx_opt.saw_tstamp = 0; 29188c2ecf20Sopenharmony_ci tp->rx_opt.dsack = 0; 29198c2ecf20Sopenharmony_ci tp->rx_opt.num_sacks = 0; 29208c2ecf20Sopenharmony_ci tp->rcv_ooopack = 0; 29218c2ecf20Sopenharmony_ci 29228c2ecf20Sopenharmony_ci 29238c2ecf20Sopenharmony_ci /* Clean up fastopen related fields */ 29248c2ecf20Sopenharmony_ci tcp_free_fastopen_req(tp); 29258c2ecf20Sopenharmony_ci inet->defer_connect = 0; 29268c2ecf20Sopenharmony_ci tp->fastopen_client_fail = 0; 29278c2ecf20Sopenharmony_ci 29288c2ecf20Sopenharmony_ci WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); 29298c2ecf20Sopenharmony_ci 29308c2ecf20Sopenharmony_ci if (sk->sk_frag.page) { 29318c2ecf20Sopenharmony_ci put_page(sk->sk_frag.page); 29328c2ecf20Sopenharmony_ci sk->sk_frag.page = NULL; 29338c2ecf20Sopenharmony_ci sk->sk_frag.offset = 0; 29348c2ecf20Sopenharmony_ci } 29358c2ecf20Sopenharmony_ci 29368c2ecf20Sopenharmony_ci sk->sk_error_report(sk); 29378c2ecf20Sopenharmony_ci return 0; 29388c2ecf20Sopenharmony_ci} 29398c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_disconnect); 29408c2ecf20Sopenharmony_ci 29418c2ecf20Sopenharmony_cistatic inline bool tcp_can_repair_sock(const struct sock *sk) 29428c2ecf20Sopenharmony_ci{ 29438c2ecf20Sopenharmony_ci return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && 29448c2ecf20Sopenharmony_ci (sk->sk_state != TCP_LISTEN); 29458c2ecf20Sopenharmony_ci} 29468c2ecf20Sopenharmony_ci 29478c2ecf20Sopenharmony_cistatic int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) 29488c2ecf20Sopenharmony_ci{ 29498c2ecf20Sopenharmony_ci struct tcp_repair_window opt; 29508c2ecf20Sopenharmony_ci 29518c2ecf20Sopenharmony_ci if (!tp->repair) 29528c2ecf20Sopenharmony_ci return -EPERM; 29538c2ecf20Sopenharmony_ci 29548c2ecf20Sopenharmony_ci if (len != sizeof(opt)) 29558c2ecf20Sopenharmony_ci return -EINVAL; 29568c2ecf20Sopenharmony_ci 29578c2ecf20Sopenharmony_ci if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) 29588c2ecf20Sopenharmony_ci return -EFAULT; 29598c2ecf20Sopenharmony_ci 29608c2ecf20Sopenharmony_ci if (opt.max_window < opt.snd_wnd) 29618c2ecf20Sopenharmony_ci return -EINVAL; 29628c2ecf20Sopenharmony_ci 29638c2ecf20Sopenharmony_ci if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) 29648c2ecf20Sopenharmony_ci return -EINVAL; 29658c2ecf20Sopenharmony_ci 29668c2ecf20Sopenharmony_ci if (after(opt.rcv_wup, tp->rcv_nxt)) 29678c2ecf20Sopenharmony_ci return -EINVAL; 29688c2ecf20Sopenharmony_ci 29698c2ecf20Sopenharmony_ci tp->snd_wl1 = opt.snd_wl1; 29708c2ecf20Sopenharmony_ci tp->snd_wnd = opt.snd_wnd; 29718c2ecf20Sopenharmony_ci tp->max_window = opt.max_window; 29728c2ecf20Sopenharmony_ci 29738c2ecf20Sopenharmony_ci tp->rcv_wnd = opt.rcv_wnd; 29748c2ecf20Sopenharmony_ci tp->rcv_wup = opt.rcv_wup; 29758c2ecf20Sopenharmony_ci 29768c2ecf20Sopenharmony_ci return 0; 29778c2ecf20Sopenharmony_ci} 29788c2ecf20Sopenharmony_ci 29798c2ecf20Sopenharmony_cistatic int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, 29808c2ecf20Sopenharmony_ci unsigned int len) 29818c2ecf20Sopenharmony_ci{ 29828c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 29838c2ecf20Sopenharmony_ci struct tcp_repair_opt opt; 29848c2ecf20Sopenharmony_ci size_t offset = 0; 29858c2ecf20Sopenharmony_ci 29868c2ecf20Sopenharmony_ci while (len >= sizeof(opt)) { 29878c2ecf20Sopenharmony_ci if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) 29888c2ecf20Sopenharmony_ci return -EFAULT; 29898c2ecf20Sopenharmony_ci 29908c2ecf20Sopenharmony_ci offset += sizeof(opt); 29918c2ecf20Sopenharmony_ci len -= sizeof(opt); 29928c2ecf20Sopenharmony_ci 29938c2ecf20Sopenharmony_ci switch (opt.opt_code) { 29948c2ecf20Sopenharmony_ci case TCPOPT_MSS: 29958c2ecf20Sopenharmony_ci tp->rx_opt.mss_clamp = opt.opt_val; 29968c2ecf20Sopenharmony_ci tcp_mtup_init(sk); 29978c2ecf20Sopenharmony_ci break; 29988c2ecf20Sopenharmony_ci case TCPOPT_WINDOW: 29998c2ecf20Sopenharmony_ci { 30008c2ecf20Sopenharmony_ci u16 snd_wscale = opt.opt_val & 0xFFFF; 30018c2ecf20Sopenharmony_ci u16 rcv_wscale = opt.opt_val >> 16; 30028c2ecf20Sopenharmony_ci 30038c2ecf20Sopenharmony_ci if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) 30048c2ecf20Sopenharmony_ci return -EFBIG; 30058c2ecf20Sopenharmony_ci 30068c2ecf20Sopenharmony_ci tp->rx_opt.snd_wscale = snd_wscale; 30078c2ecf20Sopenharmony_ci tp->rx_opt.rcv_wscale = rcv_wscale; 30088c2ecf20Sopenharmony_ci tp->rx_opt.wscale_ok = 1; 30098c2ecf20Sopenharmony_ci } 30108c2ecf20Sopenharmony_ci break; 30118c2ecf20Sopenharmony_ci case TCPOPT_SACK_PERM: 30128c2ecf20Sopenharmony_ci if (opt.opt_val != 0) 30138c2ecf20Sopenharmony_ci return -EINVAL; 30148c2ecf20Sopenharmony_ci 30158c2ecf20Sopenharmony_ci tp->rx_opt.sack_ok |= TCP_SACK_SEEN; 30168c2ecf20Sopenharmony_ci break; 30178c2ecf20Sopenharmony_ci case TCPOPT_TIMESTAMP: 30188c2ecf20Sopenharmony_ci if (opt.opt_val != 0) 30198c2ecf20Sopenharmony_ci return -EINVAL; 30208c2ecf20Sopenharmony_ci 30218c2ecf20Sopenharmony_ci tp->rx_opt.tstamp_ok = 1; 30228c2ecf20Sopenharmony_ci break; 30238c2ecf20Sopenharmony_ci } 30248c2ecf20Sopenharmony_ci } 30258c2ecf20Sopenharmony_ci 30268c2ecf20Sopenharmony_ci return 0; 30278c2ecf20Sopenharmony_ci} 30288c2ecf20Sopenharmony_ci 30298c2ecf20Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); 30308c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_tx_delay_enabled); 30318c2ecf20Sopenharmony_ci 30328c2ecf20Sopenharmony_cistatic void tcp_enable_tx_delay(void) 30338c2ecf20Sopenharmony_ci{ 30348c2ecf20Sopenharmony_ci if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { 30358c2ecf20Sopenharmony_ci static int __tcp_tx_delay_enabled = 0; 30368c2ecf20Sopenharmony_ci 30378c2ecf20Sopenharmony_ci if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { 30388c2ecf20Sopenharmony_ci static_branch_enable(&tcp_tx_delay_enabled); 30398c2ecf20Sopenharmony_ci pr_info("TCP_TX_DELAY enabled\n"); 30408c2ecf20Sopenharmony_ci } 30418c2ecf20Sopenharmony_ci } 30428c2ecf20Sopenharmony_ci} 30438c2ecf20Sopenharmony_ci 30448c2ecf20Sopenharmony_ci/* When set indicates to always queue non-full frames. Later the user clears 30458c2ecf20Sopenharmony_ci * this option and we transmit any pending partial frames in the queue. This is 30468c2ecf20Sopenharmony_ci * meant to be used alongside sendfile() to get properly filled frames when the 30478c2ecf20Sopenharmony_ci * user (for example) must write out headers with a write() call first and then 30488c2ecf20Sopenharmony_ci * use sendfile to send out the data parts. 30498c2ecf20Sopenharmony_ci * 30508c2ecf20Sopenharmony_ci * TCP_CORK can be set together with TCP_NODELAY and it is stronger than 30518c2ecf20Sopenharmony_ci * TCP_NODELAY. 30528c2ecf20Sopenharmony_ci */ 30538c2ecf20Sopenharmony_cistatic void __tcp_sock_set_cork(struct sock *sk, bool on) 30548c2ecf20Sopenharmony_ci{ 30558c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 30568c2ecf20Sopenharmony_ci 30578c2ecf20Sopenharmony_ci if (on) { 30588c2ecf20Sopenharmony_ci tp->nonagle |= TCP_NAGLE_CORK; 30598c2ecf20Sopenharmony_ci } else { 30608c2ecf20Sopenharmony_ci tp->nonagle &= ~TCP_NAGLE_CORK; 30618c2ecf20Sopenharmony_ci if (tp->nonagle & TCP_NAGLE_OFF) 30628c2ecf20Sopenharmony_ci tp->nonagle |= TCP_NAGLE_PUSH; 30638c2ecf20Sopenharmony_ci tcp_push_pending_frames(sk); 30648c2ecf20Sopenharmony_ci } 30658c2ecf20Sopenharmony_ci} 30668c2ecf20Sopenharmony_ci 30678c2ecf20Sopenharmony_civoid tcp_sock_set_cork(struct sock *sk, bool on) 30688c2ecf20Sopenharmony_ci{ 30698c2ecf20Sopenharmony_ci lock_sock(sk); 30708c2ecf20Sopenharmony_ci __tcp_sock_set_cork(sk, on); 30718c2ecf20Sopenharmony_ci release_sock(sk); 30728c2ecf20Sopenharmony_ci} 30738c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_cork); 30748c2ecf20Sopenharmony_ci 30758c2ecf20Sopenharmony_ci/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is 30768c2ecf20Sopenharmony_ci * remembered, but it is not activated until cork is cleared. 30778c2ecf20Sopenharmony_ci * 30788c2ecf20Sopenharmony_ci * However, when TCP_NODELAY is set we make an explicit push, which overrides 30798c2ecf20Sopenharmony_ci * even TCP_CORK for currently queued segments. 30808c2ecf20Sopenharmony_ci */ 30818c2ecf20Sopenharmony_cistatic void __tcp_sock_set_nodelay(struct sock *sk, bool on) 30828c2ecf20Sopenharmony_ci{ 30838c2ecf20Sopenharmony_ci if (on) { 30848c2ecf20Sopenharmony_ci tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; 30858c2ecf20Sopenharmony_ci tcp_push_pending_frames(sk); 30868c2ecf20Sopenharmony_ci } else { 30878c2ecf20Sopenharmony_ci tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; 30888c2ecf20Sopenharmony_ci } 30898c2ecf20Sopenharmony_ci} 30908c2ecf20Sopenharmony_ci 30918c2ecf20Sopenharmony_civoid tcp_sock_set_nodelay(struct sock *sk) 30928c2ecf20Sopenharmony_ci{ 30938c2ecf20Sopenharmony_ci lock_sock(sk); 30948c2ecf20Sopenharmony_ci __tcp_sock_set_nodelay(sk, true); 30958c2ecf20Sopenharmony_ci release_sock(sk); 30968c2ecf20Sopenharmony_ci} 30978c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_nodelay); 30988c2ecf20Sopenharmony_ci 30998c2ecf20Sopenharmony_cistatic void __tcp_sock_set_quickack(struct sock *sk, int val) 31008c2ecf20Sopenharmony_ci{ 31018c2ecf20Sopenharmony_ci if (!val) { 31028c2ecf20Sopenharmony_ci inet_csk_enter_pingpong_mode(sk); 31038c2ecf20Sopenharmony_ci return; 31048c2ecf20Sopenharmony_ci } 31058c2ecf20Sopenharmony_ci 31068c2ecf20Sopenharmony_ci inet_csk_exit_pingpong_mode(sk); 31078c2ecf20Sopenharmony_ci if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 31088c2ecf20Sopenharmony_ci inet_csk_ack_scheduled(sk)) { 31098c2ecf20Sopenharmony_ci inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; 31108c2ecf20Sopenharmony_ci tcp_cleanup_rbuf(sk, 1); 31118c2ecf20Sopenharmony_ci if (!(val & 1)) 31128c2ecf20Sopenharmony_ci inet_csk_enter_pingpong_mode(sk); 31138c2ecf20Sopenharmony_ci } 31148c2ecf20Sopenharmony_ci} 31158c2ecf20Sopenharmony_ci 31168c2ecf20Sopenharmony_civoid tcp_sock_set_quickack(struct sock *sk, int val) 31178c2ecf20Sopenharmony_ci{ 31188c2ecf20Sopenharmony_ci lock_sock(sk); 31198c2ecf20Sopenharmony_ci __tcp_sock_set_quickack(sk, val); 31208c2ecf20Sopenharmony_ci release_sock(sk); 31218c2ecf20Sopenharmony_ci} 31228c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_quickack); 31238c2ecf20Sopenharmony_ci 31248c2ecf20Sopenharmony_ciint tcp_sock_set_syncnt(struct sock *sk, int val) 31258c2ecf20Sopenharmony_ci{ 31268c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_SYNCNT) 31278c2ecf20Sopenharmony_ci return -EINVAL; 31288c2ecf20Sopenharmony_ci 31298c2ecf20Sopenharmony_ci lock_sock(sk); 31308c2ecf20Sopenharmony_ci WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); 31318c2ecf20Sopenharmony_ci release_sock(sk); 31328c2ecf20Sopenharmony_ci return 0; 31338c2ecf20Sopenharmony_ci} 31348c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_syncnt); 31358c2ecf20Sopenharmony_ci 31368c2ecf20Sopenharmony_civoid tcp_sock_set_user_timeout(struct sock *sk, u32 val) 31378c2ecf20Sopenharmony_ci{ 31388c2ecf20Sopenharmony_ci lock_sock(sk); 31398c2ecf20Sopenharmony_ci WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); 31408c2ecf20Sopenharmony_ci release_sock(sk); 31418c2ecf20Sopenharmony_ci} 31428c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_user_timeout); 31438c2ecf20Sopenharmony_ci 31448c2ecf20Sopenharmony_ciint tcp_sock_set_keepidle_locked(struct sock *sk, int val) 31458c2ecf20Sopenharmony_ci{ 31468c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 31478c2ecf20Sopenharmony_ci 31488c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_KEEPIDLE) 31498c2ecf20Sopenharmony_ci return -EINVAL; 31508c2ecf20Sopenharmony_ci 31518c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() in keepalive_time_when() */ 31528c2ecf20Sopenharmony_ci WRITE_ONCE(tp->keepalive_time, val * HZ); 31538c2ecf20Sopenharmony_ci if (sock_flag(sk, SOCK_KEEPOPEN) && 31548c2ecf20Sopenharmony_ci !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { 31558c2ecf20Sopenharmony_ci u32 elapsed = keepalive_time_elapsed(tp); 31568c2ecf20Sopenharmony_ci 31578c2ecf20Sopenharmony_ci if (tp->keepalive_time > elapsed) 31588c2ecf20Sopenharmony_ci elapsed = tp->keepalive_time - elapsed; 31598c2ecf20Sopenharmony_ci else 31608c2ecf20Sopenharmony_ci elapsed = 0; 31618c2ecf20Sopenharmony_ci inet_csk_reset_keepalive_timer(sk, elapsed); 31628c2ecf20Sopenharmony_ci } 31638c2ecf20Sopenharmony_ci 31648c2ecf20Sopenharmony_ci return 0; 31658c2ecf20Sopenharmony_ci} 31668c2ecf20Sopenharmony_ci 31678c2ecf20Sopenharmony_ciint tcp_sock_set_keepidle(struct sock *sk, int val) 31688c2ecf20Sopenharmony_ci{ 31698c2ecf20Sopenharmony_ci int err; 31708c2ecf20Sopenharmony_ci 31718c2ecf20Sopenharmony_ci lock_sock(sk); 31728c2ecf20Sopenharmony_ci err = tcp_sock_set_keepidle_locked(sk, val); 31738c2ecf20Sopenharmony_ci release_sock(sk); 31748c2ecf20Sopenharmony_ci return err; 31758c2ecf20Sopenharmony_ci} 31768c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_keepidle); 31778c2ecf20Sopenharmony_ci 31788c2ecf20Sopenharmony_ciint tcp_sock_set_keepintvl(struct sock *sk, int val) 31798c2ecf20Sopenharmony_ci{ 31808c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_KEEPINTVL) 31818c2ecf20Sopenharmony_ci return -EINVAL; 31828c2ecf20Sopenharmony_ci 31838c2ecf20Sopenharmony_ci lock_sock(sk); 31848c2ecf20Sopenharmony_ci WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); 31858c2ecf20Sopenharmony_ci release_sock(sk); 31868c2ecf20Sopenharmony_ci return 0; 31878c2ecf20Sopenharmony_ci} 31888c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_keepintvl); 31898c2ecf20Sopenharmony_ci 31908c2ecf20Sopenharmony_ciint tcp_sock_set_keepcnt(struct sock *sk, int val) 31918c2ecf20Sopenharmony_ci{ 31928c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_KEEPCNT) 31938c2ecf20Sopenharmony_ci return -EINVAL; 31948c2ecf20Sopenharmony_ci 31958c2ecf20Sopenharmony_ci lock_sock(sk); 31968c2ecf20Sopenharmony_ci /* Paired with READ_ONCE() in keepalive_probes() */ 31978c2ecf20Sopenharmony_ci WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); 31988c2ecf20Sopenharmony_ci release_sock(sk); 31998c2ecf20Sopenharmony_ci return 0; 32008c2ecf20Sopenharmony_ci} 32018c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_sock_set_keepcnt); 32028c2ecf20Sopenharmony_ci 32038c2ecf20Sopenharmony_ci/* 32048c2ecf20Sopenharmony_ci * Socket option code for TCP. 32058c2ecf20Sopenharmony_ci */ 32068c2ecf20Sopenharmony_cistatic int do_tcp_setsockopt(struct sock *sk, int level, int optname, 32078c2ecf20Sopenharmony_ci sockptr_t optval, unsigned int optlen) 32088c2ecf20Sopenharmony_ci{ 32098c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 32108c2ecf20Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 32118c2ecf20Sopenharmony_ci struct net *net = sock_net(sk); 32128c2ecf20Sopenharmony_ci int val; 32138c2ecf20Sopenharmony_ci int err = 0; 32148c2ecf20Sopenharmony_ci 32158c2ecf20Sopenharmony_ci /* These are data/string values, all the others are ints */ 32168c2ecf20Sopenharmony_ci switch (optname) { 32178c2ecf20Sopenharmony_ci case TCP_CONGESTION: { 32188c2ecf20Sopenharmony_ci char name[TCP_CA_NAME_MAX]; 32198c2ecf20Sopenharmony_ci 32208c2ecf20Sopenharmony_ci if (optlen < 1) 32218c2ecf20Sopenharmony_ci return -EINVAL; 32228c2ecf20Sopenharmony_ci 32238c2ecf20Sopenharmony_ci val = strncpy_from_sockptr(name, optval, 32248c2ecf20Sopenharmony_ci min_t(long, TCP_CA_NAME_MAX-1, optlen)); 32258c2ecf20Sopenharmony_ci if (val < 0) 32268c2ecf20Sopenharmony_ci return -EFAULT; 32278c2ecf20Sopenharmony_ci name[val] = 0; 32288c2ecf20Sopenharmony_ci 32298c2ecf20Sopenharmony_ci lock_sock(sk); 32308c2ecf20Sopenharmony_ci err = tcp_set_congestion_control(sk, name, true, 32318c2ecf20Sopenharmony_ci ns_capable(sock_net(sk)->user_ns, 32328c2ecf20Sopenharmony_ci CAP_NET_ADMIN)); 32338c2ecf20Sopenharmony_ci release_sock(sk); 32348c2ecf20Sopenharmony_ci return err; 32358c2ecf20Sopenharmony_ci } 32368c2ecf20Sopenharmony_ci case TCP_ULP: { 32378c2ecf20Sopenharmony_ci char name[TCP_ULP_NAME_MAX]; 32388c2ecf20Sopenharmony_ci 32398c2ecf20Sopenharmony_ci if (optlen < 1) 32408c2ecf20Sopenharmony_ci return -EINVAL; 32418c2ecf20Sopenharmony_ci 32428c2ecf20Sopenharmony_ci val = strncpy_from_sockptr(name, optval, 32438c2ecf20Sopenharmony_ci min_t(long, TCP_ULP_NAME_MAX - 1, 32448c2ecf20Sopenharmony_ci optlen)); 32458c2ecf20Sopenharmony_ci if (val < 0) 32468c2ecf20Sopenharmony_ci return -EFAULT; 32478c2ecf20Sopenharmony_ci name[val] = 0; 32488c2ecf20Sopenharmony_ci 32498c2ecf20Sopenharmony_ci lock_sock(sk); 32508c2ecf20Sopenharmony_ci err = tcp_set_ulp(sk, name); 32518c2ecf20Sopenharmony_ci release_sock(sk); 32528c2ecf20Sopenharmony_ci return err; 32538c2ecf20Sopenharmony_ci } 32548c2ecf20Sopenharmony_ci case TCP_FASTOPEN_KEY: { 32558c2ecf20Sopenharmony_ci __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; 32568c2ecf20Sopenharmony_ci __u8 *backup_key = NULL; 32578c2ecf20Sopenharmony_ci 32588c2ecf20Sopenharmony_ci /* Allow a backup key as well to facilitate key rotation 32598c2ecf20Sopenharmony_ci * First key is the active one. 32608c2ecf20Sopenharmony_ci */ 32618c2ecf20Sopenharmony_ci if (optlen != TCP_FASTOPEN_KEY_LENGTH && 32628c2ecf20Sopenharmony_ci optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) 32638c2ecf20Sopenharmony_ci return -EINVAL; 32648c2ecf20Sopenharmony_ci 32658c2ecf20Sopenharmony_ci if (copy_from_sockptr(key, optval, optlen)) 32668c2ecf20Sopenharmony_ci return -EFAULT; 32678c2ecf20Sopenharmony_ci 32688c2ecf20Sopenharmony_ci if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) 32698c2ecf20Sopenharmony_ci backup_key = key + TCP_FASTOPEN_KEY_LENGTH; 32708c2ecf20Sopenharmony_ci 32718c2ecf20Sopenharmony_ci return tcp_fastopen_reset_cipher(net, sk, key, backup_key); 32728c2ecf20Sopenharmony_ci } 32738c2ecf20Sopenharmony_ci default: 32748c2ecf20Sopenharmony_ci /* fallthru */ 32758c2ecf20Sopenharmony_ci break; 32768c2ecf20Sopenharmony_ci } 32778c2ecf20Sopenharmony_ci 32788c2ecf20Sopenharmony_ci if (optlen < sizeof(int)) 32798c2ecf20Sopenharmony_ci return -EINVAL; 32808c2ecf20Sopenharmony_ci 32818c2ecf20Sopenharmony_ci if (copy_from_sockptr(&val, optval, sizeof(val))) 32828c2ecf20Sopenharmony_ci return -EFAULT; 32838c2ecf20Sopenharmony_ci 32848c2ecf20Sopenharmony_ci lock_sock(sk); 32858c2ecf20Sopenharmony_ci 32868c2ecf20Sopenharmony_ci switch (optname) { 32878c2ecf20Sopenharmony_ci case TCP_MAXSEG: 32888c2ecf20Sopenharmony_ci /* Values greater than interface MTU won't take effect. However 32898c2ecf20Sopenharmony_ci * at the point when this call is done we typically don't yet 32908c2ecf20Sopenharmony_ci * know which interface is going to be used 32918c2ecf20Sopenharmony_ci */ 32928c2ecf20Sopenharmony_ci if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { 32938c2ecf20Sopenharmony_ci err = -EINVAL; 32948c2ecf20Sopenharmony_ci break; 32958c2ecf20Sopenharmony_ci } 32968c2ecf20Sopenharmony_ci tp->rx_opt.user_mss = val; 32978c2ecf20Sopenharmony_ci break; 32988c2ecf20Sopenharmony_ci 32998c2ecf20Sopenharmony_ci case TCP_NODELAY: 33008c2ecf20Sopenharmony_ci __tcp_sock_set_nodelay(sk, val); 33018c2ecf20Sopenharmony_ci break; 33028c2ecf20Sopenharmony_ci 33038c2ecf20Sopenharmony_ci case TCP_THIN_LINEAR_TIMEOUTS: 33048c2ecf20Sopenharmony_ci if (val < 0 || val > 1) 33058c2ecf20Sopenharmony_ci err = -EINVAL; 33068c2ecf20Sopenharmony_ci else 33078c2ecf20Sopenharmony_ci tp->thin_lto = val; 33088c2ecf20Sopenharmony_ci break; 33098c2ecf20Sopenharmony_ci 33108c2ecf20Sopenharmony_ci case TCP_THIN_DUPACK: 33118c2ecf20Sopenharmony_ci if (val < 0 || val > 1) 33128c2ecf20Sopenharmony_ci err = -EINVAL; 33138c2ecf20Sopenharmony_ci break; 33148c2ecf20Sopenharmony_ci 33158c2ecf20Sopenharmony_ci case TCP_REPAIR: 33168c2ecf20Sopenharmony_ci if (!tcp_can_repair_sock(sk)) 33178c2ecf20Sopenharmony_ci err = -EPERM; 33188c2ecf20Sopenharmony_ci else if (val == TCP_REPAIR_ON) { 33198c2ecf20Sopenharmony_ci tp->repair = 1; 33208c2ecf20Sopenharmony_ci sk->sk_reuse = SK_FORCE_REUSE; 33218c2ecf20Sopenharmony_ci tp->repair_queue = TCP_NO_QUEUE; 33228c2ecf20Sopenharmony_ci } else if (val == TCP_REPAIR_OFF) { 33238c2ecf20Sopenharmony_ci tp->repair = 0; 33248c2ecf20Sopenharmony_ci sk->sk_reuse = SK_NO_REUSE; 33258c2ecf20Sopenharmony_ci tcp_send_window_probe(sk); 33268c2ecf20Sopenharmony_ci } else if (val == TCP_REPAIR_OFF_NO_WP) { 33278c2ecf20Sopenharmony_ci tp->repair = 0; 33288c2ecf20Sopenharmony_ci sk->sk_reuse = SK_NO_REUSE; 33298c2ecf20Sopenharmony_ci } else 33308c2ecf20Sopenharmony_ci err = -EINVAL; 33318c2ecf20Sopenharmony_ci 33328c2ecf20Sopenharmony_ci break; 33338c2ecf20Sopenharmony_ci 33348c2ecf20Sopenharmony_ci case TCP_REPAIR_QUEUE: 33358c2ecf20Sopenharmony_ci if (!tp->repair) 33368c2ecf20Sopenharmony_ci err = -EPERM; 33378c2ecf20Sopenharmony_ci else if ((unsigned int)val < TCP_QUEUES_NR) 33388c2ecf20Sopenharmony_ci tp->repair_queue = val; 33398c2ecf20Sopenharmony_ci else 33408c2ecf20Sopenharmony_ci err = -EINVAL; 33418c2ecf20Sopenharmony_ci break; 33428c2ecf20Sopenharmony_ci 33438c2ecf20Sopenharmony_ci case TCP_QUEUE_SEQ: 33448c2ecf20Sopenharmony_ci if (sk->sk_state != TCP_CLOSE) { 33458c2ecf20Sopenharmony_ci err = -EPERM; 33468c2ecf20Sopenharmony_ci } else if (tp->repair_queue == TCP_SEND_QUEUE) { 33478c2ecf20Sopenharmony_ci if (!tcp_rtx_queue_empty(sk)) 33488c2ecf20Sopenharmony_ci err = -EPERM; 33498c2ecf20Sopenharmony_ci else 33508c2ecf20Sopenharmony_ci WRITE_ONCE(tp->write_seq, val); 33518c2ecf20Sopenharmony_ci } else if (tp->repair_queue == TCP_RECV_QUEUE) { 33528c2ecf20Sopenharmony_ci if (tp->rcv_nxt != tp->copied_seq) { 33538c2ecf20Sopenharmony_ci err = -EPERM; 33548c2ecf20Sopenharmony_ci } else { 33558c2ecf20Sopenharmony_ci WRITE_ONCE(tp->rcv_nxt, val); 33568c2ecf20Sopenharmony_ci WRITE_ONCE(tp->copied_seq, val); 33578c2ecf20Sopenharmony_ci } 33588c2ecf20Sopenharmony_ci } else { 33598c2ecf20Sopenharmony_ci err = -EINVAL; 33608c2ecf20Sopenharmony_ci } 33618c2ecf20Sopenharmony_ci break; 33628c2ecf20Sopenharmony_ci 33638c2ecf20Sopenharmony_ci case TCP_REPAIR_OPTIONS: 33648c2ecf20Sopenharmony_ci if (!tp->repair) 33658c2ecf20Sopenharmony_ci err = -EINVAL; 33668c2ecf20Sopenharmony_ci else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) 33678c2ecf20Sopenharmony_ci err = tcp_repair_options_est(sk, optval, optlen); 33688c2ecf20Sopenharmony_ci else 33698c2ecf20Sopenharmony_ci err = -EPERM; 33708c2ecf20Sopenharmony_ci break; 33718c2ecf20Sopenharmony_ci 33728c2ecf20Sopenharmony_ci case TCP_CORK: 33738c2ecf20Sopenharmony_ci __tcp_sock_set_cork(sk, val); 33748c2ecf20Sopenharmony_ci break; 33758c2ecf20Sopenharmony_ci 33768c2ecf20Sopenharmony_ci case TCP_KEEPIDLE: 33778c2ecf20Sopenharmony_ci err = tcp_sock_set_keepidle_locked(sk, val); 33788c2ecf20Sopenharmony_ci break; 33798c2ecf20Sopenharmony_ci case TCP_KEEPINTVL: 33808c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_KEEPINTVL) 33818c2ecf20Sopenharmony_ci err = -EINVAL; 33828c2ecf20Sopenharmony_ci else 33838c2ecf20Sopenharmony_ci WRITE_ONCE(tp->keepalive_intvl, val * HZ); 33848c2ecf20Sopenharmony_ci break; 33858c2ecf20Sopenharmony_ci case TCP_KEEPCNT: 33868c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_KEEPCNT) 33878c2ecf20Sopenharmony_ci err = -EINVAL; 33888c2ecf20Sopenharmony_ci else 33898c2ecf20Sopenharmony_ci WRITE_ONCE(tp->keepalive_probes, val); 33908c2ecf20Sopenharmony_ci break; 33918c2ecf20Sopenharmony_ci case TCP_SYNCNT: 33928c2ecf20Sopenharmony_ci if (val < 1 || val > MAX_TCP_SYNCNT) 33938c2ecf20Sopenharmony_ci err = -EINVAL; 33948c2ecf20Sopenharmony_ci else 33958c2ecf20Sopenharmony_ci WRITE_ONCE(icsk->icsk_syn_retries, val); 33968c2ecf20Sopenharmony_ci break; 33978c2ecf20Sopenharmony_ci 33988c2ecf20Sopenharmony_ci case TCP_SAVE_SYN: 33998c2ecf20Sopenharmony_ci /* 0: disable, 1: enable, 2: start from ether_header */ 34008c2ecf20Sopenharmony_ci if (val < 0 || val > 2) 34018c2ecf20Sopenharmony_ci err = -EINVAL; 34028c2ecf20Sopenharmony_ci else 34038c2ecf20Sopenharmony_ci tp->save_syn = val; 34048c2ecf20Sopenharmony_ci break; 34058c2ecf20Sopenharmony_ci 34068c2ecf20Sopenharmony_ci case TCP_LINGER2: 34078c2ecf20Sopenharmony_ci if (val < 0) 34088c2ecf20Sopenharmony_ci WRITE_ONCE(tp->linger2, -1); 34098c2ecf20Sopenharmony_ci else if (val > TCP_FIN_TIMEOUT_MAX / HZ) 34108c2ecf20Sopenharmony_ci WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); 34118c2ecf20Sopenharmony_ci else 34128c2ecf20Sopenharmony_ci WRITE_ONCE(tp->linger2, val * HZ); 34138c2ecf20Sopenharmony_ci break; 34148c2ecf20Sopenharmony_ci 34158c2ecf20Sopenharmony_ci case TCP_DEFER_ACCEPT: 34168c2ecf20Sopenharmony_ci /* Translate value in seconds to number of retransmits */ 34178c2ecf20Sopenharmony_ci WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, 34188c2ecf20Sopenharmony_ci secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, 34198c2ecf20Sopenharmony_ci TCP_RTO_MAX / HZ)); 34208c2ecf20Sopenharmony_ci break; 34218c2ecf20Sopenharmony_ci 34228c2ecf20Sopenharmony_ci case TCP_WINDOW_CLAMP: 34238c2ecf20Sopenharmony_ci if (!val) { 34248c2ecf20Sopenharmony_ci if (sk->sk_state != TCP_CLOSE) { 34258c2ecf20Sopenharmony_ci err = -EINVAL; 34268c2ecf20Sopenharmony_ci break; 34278c2ecf20Sopenharmony_ci } 34288c2ecf20Sopenharmony_ci tp->window_clamp = 0; 34298c2ecf20Sopenharmony_ci } else 34308c2ecf20Sopenharmony_ci tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 34318c2ecf20Sopenharmony_ci SOCK_MIN_RCVBUF / 2 : val; 34328c2ecf20Sopenharmony_ci break; 34338c2ecf20Sopenharmony_ci 34348c2ecf20Sopenharmony_ci case TCP_QUICKACK: 34358c2ecf20Sopenharmony_ci __tcp_sock_set_quickack(sk, val); 34368c2ecf20Sopenharmony_ci break; 34378c2ecf20Sopenharmony_ci 34388c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 34398c2ecf20Sopenharmony_ci case TCP_MD5SIG: 34408c2ecf20Sopenharmony_ci case TCP_MD5SIG_EXT: 34418c2ecf20Sopenharmony_ci err = tp->af_specific->md5_parse(sk, optname, optval, optlen); 34428c2ecf20Sopenharmony_ci break; 34438c2ecf20Sopenharmony_ci#endif 34448c2ecf20Sopenharmony_ci case TCP_USER_TIMEOUT: 34458c2ecf20Sopenharmony_ci /* Cap the max time in ms TCP will retry or probe the window 34468c2ecf20Sopenharmony_ci * before giving up and aborting (ETIMEDOUT) a connection. 34478c2ecf20Sopenharmony_ci */ 34488c2ecf20Sopenharmony_ci if (val < 0) 34498c2ecf20Sopenharmony_ci err = -EINVAL; 34508c2ecf20Sopenharmony_ci else 34518c2ecf20Sopenharmony_ci WRITE_ONCE(icsk->icsk_user_timeout, val); 34528c2ecf20Sopenharmony_ci break; 34538c2ecf20Sopenharmony_ci 34548c2ecf20Sopenharmony_ci case TCP_FASTOPEN: 34558c2ecf20Sopenharmony_ci if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | 34568c2ecf20Sopenharmony_ci TCPF_LISTEN))) { 34578c2ecf20Sopenharmony_ci tcp_fastopen_init_key_once(net); 34588c2ecf20Sopenharmony_ci 34598c2ecf20Sopenharmony_ci fastopen_queue_tune(sk, val); 34608c2ecf20Sopenharmony_ci } else { 34618c2ecf20Sopenharmony_ci err = -EINVAL; 34628c2ecf20Sopenharmony_ci } 34638c2ecf20Sopenharmony_ci break; 34648c2ecf20Sopenharmony_ci case TCP_FASTOPEN_CONNECT: 34658c2ecf20Sopenharmony_ci if (val > 1 || val < 0) { 34668c2ecf20Sopenharmony_ci err = -EINVAL; 34678c2ecf20Sopenharmony_ci } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & 34688c2ecf20Sopenharmony_ci TFO_CLIENT_ENABLE) { 34698c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_CLOSE) 34708c2ecf20Sopenharmony_ci tp->fastopen_connect = val; 34718c2ecf20Sopenharmony_ci else 34728c2ecf20Sopenharmony_ci err = -EINVAL; 34738c2ecf20Sopenharmony_ci } else { 34748c2ecf20Sopenharmony_ci err = -EOPNOTSUPP; 34758c2ecf20Sopenharmony_ci } 34768c2ecf20Sopenharmony_ci break; 34778c2ecf20Sopenharmony_ci case TCP_FASTOPEN_NO_COOKIE: 34788c2ecf20Sopenharmony_ci if (val > 1 || val < 0) 34798c2ecf20Sopenharmony_ci err = -EINVAL; 34808c2ecf20Sopenharmony_ci else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 34818c2ecf20Sopenharmony_ci err = -EINVAL; 34828c2ecf20Sopenharmony_ci else 34838c2ecf20Sopenharmony_ci tp->fastopen_no_cookie = val; 34848c2ecf20Sopenharmony_ci break; 34858c2ecf20Sopenharmony_ci case TCP_TIMESTAMP: 34868c2ecf20Sopenharmony_ci if (!tp->repair) 34878c2ecf20Sopenharmony_ci err = -EPERM; 34888c2ecf20Sopenharmony_ci else 34898c2ecf20Sopenharmony_ci tp->tsoffset = val - tcp_time_stamp_raw(); 34908c2ecf20Sopenharmony_ci break; 34918c2ecf20Sopenharmony_ci case TCP_REPAIR_WINDOW: 34928c2ecf20Sopenharmony_ci err = tcp_repair_set_window(tp, optval, optlen); 34938c2ecf20Sopenharmony_ci break; 34948c2ecf20Sopenharmony_ci case TCP_NOTSENT_LOWAT: 34958c2ecf20Sopenharmony_ci WRITE_ONCE(tp->notsent_lowat, val); 34968c2ecf20Sopenharmony_ci sk->sk_write_space(sk); 34978c2ecf20Sopenharmony_ci break; 34988c2ecf20Sopenharmony_ci case TCP_INQ: 34998c2ecf20Sopenharmony_ci if (val > 1 || val < 0) 35008c2ecf20Sopenharmony_ci err = -EINVAL; 35018c2ecf20Sopenharmony_ci else 35028c2ecf20Sopenharmony_ci tp->recvmsg_inq = val; 35038c2ecf20Sopenharmony_ci break; 35048c2ecf20Sopenharmony_ci case TCP_TX_DELAY: 35058c2ecf20Sopenharmony_ci if (val) 35068c2ecf20Sopenharmony_ci tcp_enable_tx_delay(); 35078c2ecf20Sopenharmony_ci WRITE_ONCE(tp->tcp_tx_delay, val); 35088c2ecf20Sopenharmony_ci break; 35098c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_NATA_URC 35108c2ecf20Sopenharmony_ci case TCP_NATA_URC: 35118c2ecf20Sopenharmony_ci err = tcp_set_nata_urc(sk, optval, optlen); 35128c2ecf20Sopenharmony_ci break; 35138c2ecf20Sopenharmony_ci#endif 35148c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_NATA_STL 35158c2ecf20Sopenharmony_ci case TCP_NATA_STL: 35168c2ecf20Sopenharmony_ci err = tcp_set_nata_stl(sk, optval, optlen); 35178c2ecf20Sopenharmony_ci break; 35188c2ecf20Sopenharmony_ci#endif 35198c2ecf20Sopenharmony_ci default: 35208c2ecf20Sopenharmony_ci err = -ENOPROTOOPT; 35218c2ecf20Sopenharmony_ci break; 35228c2ecf20Sopenharmony_ci } 35238c2ecf20Sopenharmony_ci 35248c2ecf20Sopenharmony_ci release_sock(sk); 35258c2ecf20Sopenharmony_ci return err; 35268c2ecf20Sopenharmony_ci} 35278c2ecf20Sopenharmony_ci 35288c2ecf20Sopenharmony_ciint tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, 35298c2ecf20Sopenharmony_ci unsigned int optlen) 35308c2ecf20Sopenharmony_ci{ 35318c2ecf20Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 35328c2ecf20Sopenharmony_ci 35338c2ecf20Sopenharmony_ci if (level != SOL_TCP) 35348c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ 35358c2ecf20Sopenharmony_ci return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, 35368c2ecf20Sopenharmony_ci optval, optlen); 35378c2ecf20Sopenharmony_ci return do_tcp_setsockopt(sk, level, optname, optval, optlen); 35388c2ecf20Sopenharmony_ci} 35398c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_setsockopt); 35408c2ecf20Sopenharmony_ci 35418c2ecf20Sopenharmony_cistatic void tcp_get_info_chrono_stats(const struct tcp_sock *tp, 35428c2ecf20Sopenharmony_ci struct tcp_info *info) 35438c2ecf20Sopenharmony_ci{ 35448c2ecf20Sopenharmony_ci u64 stats[__TCP_CHRONO_MAX], total = 0; 35458c2ecf20Sopenharmony_ci enum tcp_chrono i; 35468c2ecf20Sopenharmony_ci 35478c2ecf20Sopenharmony_ci for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { 35488c2ecf20Sopenharmony_ci stats[i] = tp->chrono_stat[i - 1]; 35498c2ecf20Sopenharmony_ci if (i == tp->chrono_type) 35508c2ecf20Sopenharmony_ci stats[i] += tcp_jiffies32 - tp->chrono_start; 35518c2ecf20Sopenharmony_ci stats[i] *= USEC_PER_SEC / HZ; 35528c2ecf20Sopenharmony_ci total += stats[i]; 35538c2ecf20Sopenharmony_ci } 35548c2ecf20Sopenharmony_ci 35558c2ecf20Sopenharmony_ci info->tcpi_busy_time = total; 35568c2ecf20Sopenharmony_ci info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; 35578c2ecf20Sopenharmony_ci info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; 35588c2ecf20Sopenharmony_ci} 35598c2ecf20Sopenharmony_ci 35608c2ecf20Sopenharmony_ci/* Return information about state of tcp endpoint in API format. */ 35618c2ecf20Sopenharmony_civoid tcp_get_info(struct sock *sk, struct tcp_info *info) 35628c2ecf20Sopenharmony_ci{ 35638c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 35648c2ecf20Sopenharmony_ci const struct inet_connection_sock *icsk = inet_csk(sk); 35658c2ecf20Sopenharmony_ci unsigned long rate; 35668c2ecf20Sopenharmony_ci u32 now; 35678c2ecf20Sopenharmony_ci u64 rate64; 35688c2ecf20Sopenharmony_ci bool slow; 35698c2ecf20Sopenharmony_ci 35708c2ecf20Sopenharmony_ci memset(info, 0, sizeof(*info)); 35718c2ecf20Sopenharmony_ci if (sk->sk_type != SOCK_STREAM) 35728c2ecf20Sopenharmony_ci return; 35738c2ecf20Sopenharmony_ci 35748c2ecf20Sopenharmony_ci info->tcpi_state = inet_sk_state_load(sk); 35758c2ecf20Sopenharmony_ci 35768c2ecf20Sopenharmony_ci /* Report meaningful fields for all TCP states, including listeners */ 35778c2ecf20Sopenharmony_ci rate = READ_ONCE(sk->sk_pacing_rate); 35788c2ecf20Sopenharmony_ci rate64 = (rate != ~0UL) ? rate : ~0ULL; 35798c2ecf20Sopenharmony_ci info->tcpi_pacing_rate = rate64; 35808c2ecf20Sopenharmony_ci 35818c2ecf20Sopenharmony_ci rate = READ_ONCE(sk->sk_max_pacing_rate); 35828c2ecf20Sopenharmony_ci rate64 = (rate != ~0UL) ? rate : ~0ULL; 35838c2ecf20Sopenharmony_ci info->tcpi_max_pacing_rate = rate64; 35848c2ecf20Sopenharmony_ci 35858c2ecf20Sopenharmony_ci info->tcpi_reordering = tp->reordering; 35868c2ecf20Sopenharmony_ci info->tcpi_snd_cwnd = tp->snd_cwnd; 35878c2ecf20Sopenharmony_ci 35888c2ecf20Sopenharmony_ci if (info->tcpi_state == TCP_LISTEN) { 35898c2ecf20Sopenharmony_ci /* listeners aliased fields : 35908c2ecf20Sopenharmony_ci * tcpi_unacked -> Number of children ready for accept() 35918c2ecf20Sopenharmony_ci * tcpi_sacked -> max backlog 35928c2ecf20Sopenharmony_ci */ 35938c2ecf20Sopenharmony_ci info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); 35948c2ecf20Sopenharmony_ci info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); 35958c2ecf20Sopenharmony_ci return; 35968c2ecf20Sopenharmony_ci } 35978c2ecf20Sopenharmony_ci 35988c2ecf20Sopenharmony_ci slow = lock_sock_fast(sk); 35998c2ecf20Sopenharmony_ci 36008c2ecf20Sopenharmony_ci info->tcpi_ca_state = icsk->icsk_ca_state; 36018c2ecf20Sopenharmony_ci info->tcpi_retransmits = icsk->icsk_retransmits; 36028c2ecf20Sopenharmony_ci info->tcpi_probes = icsk->icsk_probes_out; 36038c2ecf20Sopenharmony_ci info->tcpi_backoff = icsk->icsk_backoff; 36048c2ecf20Sopenharmony_ci 36058c2ecf20Sopenharmony_ci if (tp->rx_opt.tstamp_ok) 36068c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 36078c2ecf20Sopenharmony_ci if (tcp_is_sack(tp)) 36088c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_SACK; 36098c2ecf20Sopenharmony_ci if (tp->rx_opt.wscale_ok) { 36108c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_WSCALE; 36118c2ecf20Sopenharmony_ci info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; 36128c2ecf20Sopenharmony_ci info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; 36138c2ecf20Sopenharmony_ci } 36148c2ecf20Sopenharmony_ci 36158c2ecf20Sopenharmony_ci if (tp->ecn_flags & TCP_ECN_OK) 36168c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_ECN; 36178c2ecf20Sopenharmony_ci if (tp->ecn_flags & TCP_ECN_SEEN) 36188c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_ECN_SEEN; 36198c2ecf20Sopenharmony_ci if (tp->syn_data_acked) 36208c2ecf20Sopenharmony_ci info->tcpi_options |= TCPI_OPT_SYN_DATA; 36218c2ecf20Sopenharmony_ci 36228c2ecf20Sopenharmony_ci info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); 36238c2ecf20Sopenharmony_ci info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); 36248c2ecf20Sopenharmony_ci info->tcpi_snd_mss = tp->mss_cache; 36258c2ecf20Sopenharmony_ci info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; 36268c2ecf20Sopenharmony_ci 36278c2ecf20Sopenharmony_ci info->tcpi_unacked = tp->packets_out; 36288c2ecf20Sopenharmony_ci info->tcpi_sacked = tp->sacked_out; 36298c2ecf20Sopenharmony_ci 36308c2ecf20Sopenharmony_ci info->tcpi_lost = tp->lost_out; 36318c2ecf20Sopenharmony_ci info->tcpi_retrans = tp->retrans_out; 36328c2ecf20Sopenharmony_ci 36338c2ecf20Sopenharmony_ci now = tcp_jiffies32; 36348c2ecf20Sopenharmony_ci info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 36358c2ecf20Sopenharmony_ci info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 36368c2ecf20Sopenharmony_ci info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 36378c2ecf20Sopenharmony_ci 36388c2ecf20Sopenharmony_ci info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 36398c2ecf20Sopenharmony_ci info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; 36408c2ecf20Sopenharmony_ci info->tcpi_rtt = tp->srtt_us >> 3; 36418c2ecf20Sopenharmony_ci info->tcpi_rttvar = tp->mdev_us >> 2; 36428c2ecf20Sopenharmony_ci info->tcpi_snd_ssthresh = tp->snd_ssthresh; 36438c2ecf20Sopenharmony_ci info->tcpi_advmss = tp->advmss; 36448c2ecf20Sopenharmony_ci 36458c2ecf20Sopenharmony_ci info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; 36468c2ecf20Sopenharmony_ci info->tcpi_rcv_space = tp->rcvq_space.space; 36478c2ecf20Sopenharmony_ci 36488c2ecf20Sopenharmony_ci info->tcpi_total_retrans = tp->total_retrans; 36498c2ecf20Sopenharmony_ci 36508c2ecf20Sopenharmony_ci info->tcpi_bytes_acked = tp->bytes_acked; 36518c2ecf20Sopenharmony_ci info->tcpi_bytes_received = tp->bytes_received; 36528c2ecf20Sopenharmony_ci info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); 36538c2ecf20Sopenharmony_ci tcp_get_info_chrono_stats(tp, info); 36548c2ecf20Sopenharmony_ci 36558c2ecf20Sopenharmony_ci info->tcpi_segs_out = tp->segs_out; 36568c2ecf20Sopenharmony_ci info->tcpi_segs_in = tp->segs_in; 36578c2ecf20Sopenharmony_ci 36588c2ecf20Sopenharmony_ci info->tcpi_min_rtt = tcp_min_rtt(tp); 36598c2ecf20Sopenharmony_ci info->tcpi_data_segs_in = tp->data_segs_in; 36608c2ecf20Sopenharmony_ci info->tcpi_data_segs_out = tp->data_segs_out; 36618c2ecf20Sopenharmony_ci 36628c2ecf20Sopenharmony_ci info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; 36638c2ecf20Sopenharmony_ci rate64 = tcp_compute_delivery_rate(tp); 36648c2ecf20Sopenharmony_ci if (rate64) 36658c2ecf20Sopenharmony_ci info->tcpi_delivery_rate = rate64; 36668c2ecf20Sopenharmony_ci info->tcpi_delivered = tp->delivered; 36678c2ecf20Sopenharmony_ci info->tcpi_delivered_ce = tp->delivered_ce; 36688c2ecf20Sopenharmony_ci info->tcpi_bytes_sent = tp->bytes_sent; 36698c2ecf20Sopenharmony_ci info->tcpi_bytes_retrans = tp->bytes_retrans; 36708c2ecf20Sopenharmony_ci info->tcpi_dsack_dups = tp->dsack_dups; 36718c2ecf20Sopenharmony_ci info->tcpi_reord_seen = tp->reord_seen; 36728c2ecf20Sopenharmony_ci info->tcpi_rcv_ooopack = tp->rcv_ooopack; 36738c2ecf20Sopenharmony_ci info->tcpi_snd_wnd = tp->snd_wnd; 36748c2ecf20Sopenharmony_ci info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; 36758c2ecf20Sopenharmony_ci unlock_sock_fast(sk, slow); 36768c2ecf20Sopenharmony_ci} 36778c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_get_info); 36788c2ecf20Sopenharmony_ci 36798c2ecf20Sopenharmony_cistatic size_t tcp_opt_stats_get_size(void) 36808c2ecf20Sopenharmony_ci{ 36818c2ecf20Sopenharmony_ci return 36828c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ 36838c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ 36848c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ 36858c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ 36868c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ 36878c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ 36888c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ 36898c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ 36908c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ 36918c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ 36928c2ecf20Sopenharmony_ci nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ 36938c2ecf20Sopenharmony_ci nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ 36948c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ 36958c2ecf20Sopenharmony_ci nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ 36968c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ 36978c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ 36988c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ 36998c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ 37008c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ 37018c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ 37028c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ 37038c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ 37048c2ecf20Sopenharmony_ci nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 37058c2ecf20Sopenharmony_ci nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ 37068c2ecf20Sopenharmony_ci nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 37078c2ecf20Sopenharmony_ci 0; 37088c2ecf20Sopenharmony_ci} 37098c2ecf20Sopenharmony_ci 37108c2ecf20Sopenharmony_cistruct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, 37118c2ecf20Sopenharmony_ci const struct sk_buff *orig_skb) 37128c2ecf20Sopenharmony_ci{ 37138c2ecf20Sopenharmony_ci const struct tcp_sock *tp = tcp_sk(sk); 37148c2ecf20Sopenharmony_ci struct sk_buff *stats; 37158c2ecf20Sopenharmony_ci struct tcp_info info; 37168c2ecf20Sopenharmony_ci unsigned long rate; 37178c2ecf20Sopenharmony_ci u64 rate64; 37188c2ecf20Sopenharmony_ci 37198c2ecf20Sopenharmony_ci stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); 37208c2ecf20Sopenharmony_ci if (!stats) 37218c2ecf20Sopenharmony_ci return NULL; 37228c2ecf20Sopenharmony_ci 37238c2ecf20Sopenharmony_ci tcp_get_info_chrono_stats(tp, &info); 37248c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_BUSY, 37258c2ecf20Sopenharmony_ci info.tcpi_busy_time, TCP_NLA_PAD); 37268c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, 37278c2ecf20Sopenharmony_ci info.tcpi_rwnd_limited, TCP_NLA_PAD); 37288c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, 37298c2ecf20Sopenharmony_ci info.tcpi_sndbuf_limited, TCP_NLA_PAD); 37308c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, 37318c2ecf20Sopenharmony_ci tp->data_segs_out, TCP_NLA_PAD); 37328c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, 37338c2ecf20Sopenharmony_ci tp->total_retrans, TCP_NLA_PAD); 37348c2ecf20Sopenharmony_ci 37358c2ecf20Sopenharmony_ci rate = READ_ONCE(sk->sk_pacing_rate); 37368c2ecf20Sopenharmony_ci rate64 = (rate != ~0UL) ? rate : ~0ULL; 37378c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); 37388c2ecf20Sopenharmony_ci 37398c2ecf20Sopenharmony_ci rate64 = tcp_compute_delivery_rate(tp); 37408c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); 37418c2ecf20Sopenharmony_ci 37428c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); 37438c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); 37448c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); 37458c2ecf20Sopenharmony_ci 37468c2ecf20Sopenharmony_ci nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); 37478c2ecf20Sopenharmony_ci nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); 37488c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); 37498c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); 37508c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); 37518c2ecf20Sopenharmony_ci 37528c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); 37538c2ecf20Sopenharmony_ci nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); 37548c2ecf20Sopenharmony_ci 37558c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, 37568c2ecf20Sopenharmony_ci TCP_NLA_PAD); 37578c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, 37588c2ecf20Sopenharmony_ci TCP_NLA_PAD); 37598c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); 37608c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); 37618c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); 37628c2ecf20Sopenharmony_ci nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); 37638c2ecf20Sopenharmony_ci nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, 37648c2ecf20Sopenharmony_ci max_t(int, 0, tp->write_seq - tp->snd_nxt)); 37658c2ecf20Sopenharmony_ci nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, 37668c2ecf20Sopenharmony_ci TCP_NLA_PAD); 37678c2ecf20Sopenharmony_ci 37688c2ecf20Sopenharmony_ci return stats; 37698c2ecf20Sopenharmony_ci} 37708c2ecf20Sopenharmony_ci 37718c2ecf20Sopenharmony_cistatic int do_tcp_getsockopt(struct sock *sk, int level, 37728c2ecf20Sopenharmony_ci int optname, char __user *optval, int __user *optlen) 37738c2ecf20Sopenharmony_ci{ 37748c2ecf20Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 37758c2ecf20Sopenharmony_ci struct tcp_sock *tp = tcp_sk(sk); 37768c2ecf20Sopenharmony_ci struct net *net = sock_net(sk); 37778c2ecf20Sopenharmony_ci int val, len; 37788c2ecf20Sopenharmony_ci 37798c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 37808c2ecf20Sopenharmony_ci return -EFAULT; 37818c2ecf20Sopenharmony_ci 37828c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, sizeof(int)); 37838c2ecf20Sopenharmony_ci 37848c2ecf20Sopenharmony_ci if (len < 0) 37858c2ecf20Sopenharmony_ci return -EINVAL; 37868c2ecf20Sopenharmony_ci 37878c2ecf20Sopenharmony_ci switch (optname) { 37888c2ecf20Sopenharmony_ci case TCP_MAXSEG: 37898c2ecf20Sopenharmony_ci val = tp->mss_cache; 37908c2ecf20Sopenharmony_ci if (tp->rx_opt.user_mss && 37918c2ecf20Sopenharmony_ci ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 37928c2ecf20Sopenharmony_ci val = tp->rx_opt.user_mss; 37938c2ecf20Sopenharmony_ci if (tp->repair) 37948c2ecf20Sopenharmony_ci val = tp->rx_opt.mss_clamp; 37958c2ecf20Sopenharmony_ci break; 37968c2ecf20Sopenharmony_ci case TCP_NODELAY: 37978c2ecf20Sopenharmony_ci val = !!(tp->nonagle&TCP_NAGLE_OFF); 37988c2ecf20Sopenharmony_ci break; 37998c2ecf20Sopenharmony_ci case TCP_CORK: 38008c2ecf20Sopenharmony_ci val = !!(tp->nonagle&TCP_NAGLE_CORK); 38018c2ecf20Sopenharmony_ci break; 38028c2ecf20Sopenharmony_ci case TCP_KEEPIDLE: 38038c2ecf20Sopenharmony_ci val = keepalive_time_when(tp) / HZ; 38048c2ecf20Sopenharmony_ci break; 38058c2ecf20Sopenharmony_ci case TCP_KEEPINTVL: 38068c2ecf20Sopenharmony_ci val = keepalive_intvl_when(tp) / HZ; 38078c2ecf20Sopenharmony_ci break; 38088c2ecf20Sopenharmony_ci case TCP_KEEPCNT: 38098c2ecf20Sopenharmony_ci val = keepalive_probes(tp); 38108c2ecf20Sopenharmony_ci break; 38118c2ecf20Sopenharmony_ci case TCP_SYNCNT: 38128c2ecf20Sopenharmony_ci val = READ_ONCE(icsk->icsk_syn_retries) ? : 38138c2ecf20Sopenharmony_ci READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); 38148c2ecf20Sopenharmony_ci break; 38158c2ecf20Sopenharmony_ci case TCP_LINGER2: 38168c2ecf20Sopenharmony_ci val = READ_ONCE(tp->linger2); 38178c2ecf20Sopenharmony_ci if (val >= 0) 38188c2ecf20Sopenharmony_ci val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; 38198c2ecf20Sopenharmony_ci break; 38208c2ecf20Sopenharmony_ci case TCP_DEFER_ACCEPT: 38218c2ecf20Sopenharmony_ci val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); 38228c2ecf20Sopenharmony_ci val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, 38238c2ecf20Sopenharmony_ci TCP_RTO_MAX / HZ); 38248c2ecf20Sopenharmony_ci break; 38258c2ecf20Sopenharmony_ci case TCP_WINDOW_CLAMP: 38268c2ecf20Sopenharmony_ci val = tp->window_clamp; 38278c2ecf20Sopenharmony_ci break; 38288c2ecf20Sopenharmony_ci case TCP_INFO: { 38298c2ecf20Sopenharmony_ci struct tcp_info info; 38308c2ecf20Sopenharmony_ci 38318c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 38328c2ecf20Sopenharmony_ci return -EFAULT; 38338c2ecf20Sopenharmony_ci 38348c2ecf20Sopenharmony_ci tcp_get_info(sk, &info); 38358c2ecf20Sopenharmony_ci 38368c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, sizeof(info)); 38378c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 38388c2ecf20Sopenharmony_ci return -EFAULT; 38398c2ecf20Sopenharmony_ci if (copy_to_user(optval, &info, len)) 38408c2ecf20Sopenharmony_ci return -EFAULT; 38418c2ecf20Sopenharmony_ci return 0; 38428c2ecf20Sopenharmony_ci } 38438c2ecf20Sopenharmony_ci case TCP_CC_INFO: { 38448c2ecf20Sopenharmony_ci const struct tcp_congestion_ops *ca_ops; 38458c2ecf20Sopenharmony_ci union tcp_cc_info info; 38468c2ecf20Sopenharmony_ci size_t sz = 0; 38478c2ecf20Sopenharmony_ci int attr; 38488c2ecf20Sopenharmony_ci 38498c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 38508c2ecf20Sopenharmony_ci return -EFAULT; 38518c2ecf20Sopenharmony_ci 38528c2ecf20Sopenharmony_ci ca_ops = icsk->icsk_ca_ops; 38538c2ecf20Sopenharmony_ci if (ca_ops && ca_ops->get_info) 38548c2ecf20Sopenharmony_ci sz = ca_ops->get_info(sk, ~0U, &attr, &info); 38558c2ecf20Sopenharmony_ci 38568c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, sz); 38578c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 38588c2ecf20Sopenharmony_ci return -EFAULT; 38598c2ecf20Sopenharmony_ci if (copy_to_user(optval, &info, len)) 38608c2ecf20Sopenharmony_ci return -EFAULT; 38618c2ecf20Sopenharmony_ci return 0; 38628c2ecf20Sopenharmony_ci } 38638c2ecf20Sopenharmony_ci case TCP_QUICKACK: 38648c2ecf20Sopenharmony_ci val = !inet_csk_in_pingpong_mode(sk); 38658c2ecf20Sopenharmony_ci break; 38668c2ecf20Sopenharmony_ci 38678c2ecf20Sopenharmony_ci case TCP_CONGESTION: 38688c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 38698c2ecf20Sopenharmony_ci return -EFAULT; 38708c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, TCP_CA_NAME_MAX); 38718c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 38728c2ecf20Sopenharmony_ci return -EFAULT; 38738c2ecf20Sopenharmony_ci if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) 38748c2ecf20Sopenharmony_ci return -EFAULT; 38758c2ecf20Sopenharmony_ci return 0; 38768c2ecf20Sopenharmony_ci 38778c2ecf20Sopenharmony_ci case TCP_ULP: 38788c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 38798c2ecf20Sopenharmony_ci return -EFAULT; 38808c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); 38818c2ecf20Sopenharmony_ci if (!icsk->icsk_ulp_ops) { 38828c2ecf20Sopenharmony_ci if (put_user(0, optlen)) 38838c2ecf20Sopenharmony_ci return -EFAULT; 38848c2ecf20Sopenharmony_ci return 0; 38858c2ecf20Sopenharmony_ci } 38868c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 38878c2ecf20Sopenharmony_ci return -EFAULT; 38888c2ecf20Sopenharmony_ci if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) 38898c2ecf20Sopenharmony_ci return -EFAULT; 38908c2ecf20Sopenharmony_ci return 0; 38918c2ecf20Sopenharmony_ci 38928c2ecf20Sopenharmony_ci case TCP_FASTOPEN_KEY: { 38938c2ecf20Sopenharmony_ci u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; 38948c2ecf20Sopenharmony_ci unsigned int key_len; 38958c2ecf20Sopenharmony_ci 38968c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 38978c2ecf20Sopenharmony_ci return -EFAULT; 38988c2ecf20Sopenharmony_ci 38998c2ecf20Sopenharmony_ci key_len = tcp_fastopen_get_cipher(net, icsk, key) * 39008c2ecf20Sopenharmony_ci TCP_FASTOPEN_KEY_LENGTH; 39018c2ecf20Sopenharmony_ci len = min_t(unsigned int, len, key_len); 39028c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 39038c2ecf20Sopenharmony_ci return -EFAULT; 39048c2ecf20Sopenharmony_ci if (copy_to_user(optval, key, len)) 39058c2ecf20Sopenharmony_ci return -EFAULT; 39068c2ecf20Sopenharmony_ci return 0; 39078c2ecf20Sopenharmony_ci } 39088c2ecf20Sopenharmony_ci case TCP_THIN_LINEAR_TIMEOUTS: 39098c2ecf20Sopenharmony_ci val = tp->thin_lto; 39108c2ecf20Sopenharmony_ci break; 39118c2ecf20Sopenharmony_ci 39128c2ecf20Sopenharmony_ci case TCP_THIN_DUPACK: 39138c2ecf20Sopenharmony_ci val = 0; 39148c2ecf20Sopenharmony_ci break; 39158c2ecf20Sopenharmony_ci 39168c2ecf20Sopenharmony_ci case TCP_REPAIR: 39178c2ecf20Sopenharmony_ci val = tp->repair; 39188c2ecf20Sopenharmony_ci break; 39198c2ecf20Sopenharmony_ci 39208c2ecf20Sopenharmony_ci case TCP_REPAIR_QUEUE: 39218c2ecf20Sopenharmony_ci if (tp->repair) 39228c2ecf20Sopenharmony_ci val = tp->repair_queue; 39238c2ecf20Sopenharmony_ci else 39248c2ecf20Sopenharmony_ci return -EINVAL; 39258c2ecf20Sopenharmony_ci break; 39268c2ecf20Sopenharmony_ci 39278c2ecf20Sopenharmony_ci case TCP_REPAIR_WINDOW: { 39288c2ecf20Sopenharmony_ci struct tcp_repair_window opt; 39298c2ecf20Sopenharmony_ci 39308c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 39318c2ecf20Sopenharmony_ci return -EFAULT; 39328c2ecf20Sopenharmony_ci 39338c2ecf20Sopenharmony_ci if (len != sizeof(opt)) 39348c2ecf20Sopenharmony_ci return -EINVAL; 39358c2ecf20Sopenharmony_ci 39368c2ecf20Sopenharmony_ci if (!tp->repair) 39378c2ecf20Sopenharmony_ci return -EPERM; 39388c2ecf20Sopenharmony_ci 39398c2ecf20Sopenharmony_ci opt.snd_wl1 = tp->snd_wl1; 39408c2ecf20Sopenharmony_ci opt.snd_wnd = tp->snd_wnd; 39418c2ecf20Sopenharmony_ci opt.max_window = tp->max_window; 39428c2ecf20Sopenharmony_ci opt.rcv_wnd = tp->rcv_wnd; 39438c2ecf20Sopenharmony_ci opt.rcv_wup = tp->rcv_wup; 39448c2ecf20Sopenharmony_ci 39458c2ecf20Sopenharmony_ci if (copy_to_user(optval, &opt, len)) 39468c2ecf20Sopenharmony_ci return -EFAULT; 39478c2ecf20Sopenharmony_ci return 0; 39488c2ecf20Sopenharmony_ci } 39498c2ecf20Sopenharmony_ci case TCP_QUEUE_SEQ: 39508c2ecf20Sopenharmony_ci if (tp->repair_queue == TCP_SEND_QUEUE) 39518c2ecf20Sopenharmony_ci val = tp->write_seq; 39528c2ecf20Sopenharmony_ci else if (tp->repair_queue == TCP_RECV_QUEUE) 39538c2ecf20Sopenharmony_ci val = tp->rcv_nxt; 39548c2ecf20Sopenharmony_ci else 39558c2ecf20Sopenharmony_ci return -EINVAL; 39568c2ecf20Sopenharmony_ci break; 39578c2ecf20Sopenharmony_ci 39588c2ecf20Sopenharmony_ci case TCP_USER_TIMEOUT: 39598c2ecf20Sopenharmony_ci val = READ_ONCE(icsk->icsk_user_timeout); 39608c2ecf20Sopenharmony_ci break; 39618c2ecf20Sopenharmony_ci 39628c2ecf20Sopenharmony_ci case TCP_FASTOPEN: 39638c2ecf20Sopenharmony_ci val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); 39648c2ecf20Sopenharmony_ci break; 39658c2ecf20Sopenharmony_ci 39668c2ecf20Sopenharmony_ci case TCP_FASTOPEN_CONNECT: 39678c2ecf20Sopenharmony_ci val = tp->fastopen_connect; 39688c2ecf20Sopenharmony_ci break; 39698c2ecf20Sopenharmony_ci 39708c2ecf20Sopenharmony_ci case TCP_FASTOPEN_NO_COOKIE: 39718c2ecf20Sopenharmony_ci val = tp->fastopen_no_cookie; 39728c2ecf20Sopenharmony_ci break; 39738c2ecf20Sopenharmony_ci 39748c2ecf20Sopenharmony_ci case TCP_TX_DELAY: 39758c2ecf20Sopenharmony_ci val = READ_ONCE(tp->tcp_tx_delay); 39768c2ecf20Sopenharmony_ci break; 39778c2ecf20Sopenharmony_ci 39788c2ecf20Sopenharmony_ci case TCP_TIMESTAMP: 39798c2ecf20Sopenharmony_ci val = tcp_time_stamp_raw() + tp->tsoffset; 39808c2ecf20Sopenharmony_ci break; 39818c2ecf20Sopenharmony_ci case TCP_NOTSENT_LOWAT: 39828c2ecf20Sopenharmony_ci val = READ_ONCE(tp->notsent_lowat); 39838c2ecf20Sopenharmony_ci break; 39848c2ecf20Sopenharmony_ci case TCP_INQ: 39858c2ecf20Sopenharmony_ci val = tp->recvmsg_inq; 39868c2ecf20Sopenharmony_ci break; 39878c2ecf20Sopenharmony_ci case TCP_SAVE_SYN: 39888c2ecf20Sopenharmony_ci val = tp->save_syn; 39898c2ecf20Sopenharmony_ci break; 39908c2ecf20Sopenharmony_ci case TCP_SAVED_SYN: { 39918c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 39928c2ecf20Sopenharmony_ci return -EFAULT; 39938c2ecf20Sopenharmony_ci 39948c2ecf20Sopenharmony_ci lock_sock(sk); 39958c2ecf20Sopenharmony_ci if (tp->saved_syn) { 39968c2ecf20Sopenharmony_ci if (len < tcp_saved_syn_len(tp->saved_syn)) { 39978c2ecf20Sopenharmony_ci if (put_user(tcp_saved_syn_len(tp->saved_syn), 39988c2ecf20Sopenharmony_ci optlen)) { 39998c2ecf20Sopenharmony_ci release_sock(sk); 40008c2ecf20Sopenharmony_ci return -EFAULT; 40018c2ecf20Sopenharmony_ci } 40028c2ecf20Sopenharmony_ci release_sock(sk); 40038c2ecf20Sopenharmony_ci return -EINVAL; 40048c2ecf20Sopenharmony_ci } 40058c2ecf20Sopenharmony_ci len = tcp_saved_syn_len(tp->saved_syn); 40068c2ecf20Sopenharmony_ci if (put_user(len, optlen)) { 40078c2ecf20Sopenharmony_ci release_sock(sk); 40088c2ecf20Sopenharmony_ci return -EFAULT; 40098c2ecf20Sopenharmony_ci } 40108c2ecf20Sopenharmony_ci if (copy_to_user(optval, tp->saved_syn->data, len)) { 40118c2ecf20Sopenharmony_ci release_sock(sk); 40128c2ecf20Sopenharmony_ci return -EFAULT; 40138c2ecf20Sopenharmony_ci } 40148c2ecf20Sopenharmony_ci tcp_saved_syn_free(tp); 40158c2ecf20Sopenharmony_ci release_sock(sk); 40168c2ecf20Sopenharmony_ci } else { 40178c2ecf20Sopenharmony_ci release_sock(sk); 40188c2ecf20Sopenharmony_ci len = 0; 40198c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 40208c2ecf20Sopenharmony_ci return -EFAULT; 40218c2ecf20Sopenharmony_ci } 40228c2ecf20Sopenharmony_ci return 0; 40238c2ecf20Sopenharmony_ci } 40248c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU 40258c2ecf20Sopenharmony_ci case TCP_ZEROCOPY_RECEIVE: { 40268c2ecf20Sopenharmony_ci struct tcp_zerocopy_receive zc = {}; 40278c2ecf20Sopenharmony_ci int err; 40288c2ecf20Sopenharmony_ci 40298c2ecf20Sopenharmony_ci if (get_user(len, optlen)) 40308c2ecf20Sopenharmony_ci return -EFAULT; 40318c2ecf20Sopenharmony_ci if (len < 0 || 40328c2ecf20Sopenharmony_ci len < offsetofend(struct tcp_zerocopy_receive, length)) 40338c2ecf20Sopenharmony_ci return -EINVAL; 40348c2ecf20Sopenharmony_ci if (len > sizeof(zc)) { 40358c2ecf20Sopenharmony_ci len = sizeof(zc); 40368c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 40378c2ecf20Sopenharmony_ci return -EFAULT; 40388c2ecf20Sopenharmony_ci } 40398c2ecf20Sopenharmony_ci if (copy_from_user(&zc, optval, len)) 40408c2ecf20Sopenharmony_ci return -EFAULT; 40418c2ecf20Sopenharmony_ci lock_sock(sk); 40428c2ecf20Sopenharmony_ci err = tcp_zerocopy_receive(sk, &zc); 40438c2ecf20Sopenharmony_ci err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, 40448c2ecf20Sopenharmony_ci &zc, &len, err); 40458c2ecf20Sopenharmony_ci release_sock(sk); 40468c2ecf20Sopenharmony_ci if (len >= offsetofend(struct tcp_zerocopy_receive, err)) 40478c2ecf20Sopenharmony_ci goto zerocopy_rcv_sk_err; 40488c2ecf20Sopenharmony_ci switch (len) { 40498c2ecf20Sopenharmony_ci case offsetofend(struct tcp_zerocopy_receive, err): 40508c2ecf20Sopenharmony_ci goto zerocopy_rcv_sk_err; 40518c2ecf20Sopenharmony_ci case offsetofend(struct tcp_zerocopy_receive, inq): 40528c2ecf20Sopenharmony_ci goto zerocopy_rcv_inq; 40538c2ecf20Sopenharmony_ci case offsetofend(struct tcp_zerocopy_receive, length): 40548c2ecf20Sopenharmony_ci default: 40558c2ecf20Sopenharmony_ci goto zerocopy_rcv_out; 40568c2ecf20Sopenharmony_ci } 40578c2ecf20Sopenharmony_cizerocopy_rcv_sk_err: 40588c2ecf20Sopenharmony_ci if (!err) 40598c2ecf20Sopenharmony_ci zc.err = sock_error(sk); 40608c2ecf20Sopenharmony_cizerocopy_rcv_inq: 40618c2ecf20Sopenharmony_ci zc.inq = tcp_inq_hint(sk); 40628c2ecf20Sopenharmony_cizerocopy_rcv_out: 40638c2ecf20Sopenharmony_ci if (!err && copy_to_user(optval, &zc, len)) 40648c2ecf20Sopenharmony_ci err = -EFAULT; 40658c2ecf20Sopenharmony_ci return err; 40668c2ecf20Sopenharmony_ci } 40678c2ecf20Sopenharmony_ci#endif 40688c2ecf20Sopenharmony_ci default: 40698c2ecf20Sopenharmony_ci return -ENOPROTOOPT; 40708c2ecf20Sopenharmony_ci } 40718c2ecf20Sopenharmony_ci 40728c2ecf20Sopenharmony_ci if (put_user(len, optlen)) 40738c2ecf20Sopenharmony_ci return -EFAULT; 40748c2ecf20Sopenharmony_ci if (copy_to_user(optval, &val, len)) 40758c2ecf20Sopenharmony_ci return -EFAULT; 40768c2ecf20Sopenharmony_ci return 0; 40778c2ecf20Sopenharmony_ci} 40788c2ecf20Sopenharmony_ci 40798c2ecf20Sopenharmony_cibool tcp_bpf_bypass_getsockopt(int level, int optname) 40808c2ecf20Sopenharmony_ci{ 40818c2ecf20Sopenharmony_ci /* TCP do_tcp_getsockopt has optimized getsockopt implementation 40828c2ecf20Sopenharmony_ci * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. 40838c2ecf20Sopenharmony_ci */ 40848c2ecf20Sopenharmony_ci if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) 40858c2ecf20Sopenharmony_ci return true; 40868c2ecf20Sopenharmony_ci 40878c2ecf20Sopenharmony_ci return false; 40888c2ecf20Sopenharmony_ci} 40898c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); 40908c2ecf20Sopenharmony_ci 40918c2ecf20Sopenharmony_ciint tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, 40928c2ecf20Sopenharmony_ci int __user *optlen) 40938c2ecf20Sopenharmony_ci{ 40948c2ecf20Sopenharmony_ci struct inet_connection_sock *icsk = inet_csk(sk); 40958c2ecf20Sopenharmony_ci 40968c2ecf20Sopenharmony_ci if (level != SOL_TCP) 40978c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ 40988c2ecf20Sopenharmony_ci return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, 40998c2ecf20Sopenharmony_ci optval, optlen); 41008c2ecf20Sopenharmony_ci return do_tcp_getsockopt(sk, level, optname, optval, optlen); 41018c2ecf20Sopenharmony_ci} 41028c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_getsockopt); 41038c2ecf20Sopenharmony_ci 41048c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_MD5SIG 41058c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); 41068c2ecf20Sopenharmony_cistatic DEFINE_MUTEX(tcp_md5sig_mutex); 41078c2ecf20Sopenharmony_cistatic bool tcp_md5sig_pool_populated = false; 41088c2ecf20Sopenharmony_ci 41098c2ecf20Sopenharmony_cistatic void __tcp_alloc_md5sig_pool(void) 41108c2ecf20Sopenharmony_ci{ 41118c2ecf20Sopenharmony_ci struct crypto_ahash *hash; 41128c2ecf20Sopenharmony_ci int cpu; 41138c2ecf20Sopenharmony_ci 41148c2ecf20Sopenharmony_ci hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); 41158c2ecf20Sopenharmony_ci if (IS_ERR(hash)) 41168c2ecf20Sopenharmony_ci return; 41178c2ecf20Sopenharmony_ci 41188c2ecf20Sopenharmony_ci for_each_possible_cpu(cpu) { 41198c2ecf20Sopenharmony_ci void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; 41208c2ecf20Sopenharmony_ci struct ahash_request *req; 41218c2ecf20Sopenharmony_ci 41228c2ecf20Sopenharmony_ci if (!scratch) { 41238c2ecf20Sopenharmony_ci scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + 41248c2ecf20Sopenharmony_ci sizeof(struct tcphdr), 41258c2ecf20Sopenharmony_ci GFP_KERNEL, 41268c2ecf20Sopenharmony_ci cpu_to_node(cpu)); 41278c2ecf20Sopenharmony_ci if (!scratch) 41288c2ecf20Sopenharmony_ci return; 41298c2ecf20Sopenharmony_ci per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; 41308c2ecf20Sopenharmony_ci } 41318c2ecf20Sopenharmony_ci if (per_cpu(tcp_md5sig_pool, cpu).md5_req) 41328c2ecf20Sopenharmony_ci continue; 41338c2ecf20Sopenharmony_ci 41348c2ecf20Sopenharmony_ci req = ahash_request_alloc(hash, GFP_KERNEL); 41358c2ecf20Sopenharmony_ci if (!req) 41368c2ecf20Sopenharmony_ci return; 41378c2ecf20Sopenharmony_ci 41388c2ecf20Sopenharmony_ci ahash_request_set_callback(req, 0, NULL, NULL); 41398c2ecf20Sopenharmony_ci 41408c2ecf20Sopenharmony_ci per_cpu(tcp_md5sig_pool, cpu).md5_req = req; 41418c2ecf20Sopenharmony_ci } 41428c2ecf20Sopenharmony_ci /* before setting tcp_md5sig_pool_populated, we must commit all writes 41438c2ecf20Sopenharmony_ci * to memory. See smp_rmb() in tcp_get_md5sig_pool() 41448c2ecf20Sopenharmony_ci */ 41458c2ecf20Sopenharmony_ci smp_wmb(); 41468c2ecf20Sopenharmony_ci /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() 41478c2ecf20Sopenharmony_ci * and tcp_get_md5sig_pool(). 41488c2ecf20Sopenharmony_ci */ 41498c2ecf20Sopenharmony_ci WRITE_ONCE(tcp_md5sig_pool_populated, true); 41508c2ecf20Sopenharmony_ci} 41518c2ecf20Sopenharmony_ci 41528c2ecf20Sopenharmony_cibool tcp_alloc_md5sig_pool(void) 41538c2ecf20Sopenharmony_ci{ 41548c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ 41558c2ecf20Sopenharmony_ci if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { 41568c2ecf20Sopenharmony_ci mutex_lock(&tcp_md5sig_mutex); 41578c2ecf20Sopenharmony_ci 41588c2ecf20Sopenharmony_ci if (!tcp_md5sig_pool_populated) { 41598c2ecf20Sopenharmony_ci __tcp_alloc_md5sig_pool(); 41608c2ecf20Sopenharmony_ci if (tcp_md5sig_pool_populated) 41618c2ecf20Sopenharmony_ci static_branch_inc(&tcp_md5_needed); 41628c2ecf20Sopenharmony_ci } 41638c2ecf20Sopenharmony_ci 41648c2ecf20Sopenharmony_ci mutex_unlock(&tcp_md5sig_mutex); 41658c2ecf20Sopenharmony_ci } 41668c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ 41678c2ecf20Sopenharmony_ci return READ_ONCE(tcp_md5sig_pool_populated); 41688c2ecf20Sopenharmony_ci} 41698c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_alloc_md5sig_pool); 41708c2ecf20Sopenharmony_ci 41718c2ecf20Sopenharmony_ci 41728c2ecf20Sopenharmony_ci/** 41738c2ecf20Sopenharmony_ci * tcp_get_md5sig_pool - get md5sig_pool for this user 41748c2ecf20Sopenharmony_ci * 41758c2ecf20Sopenharmony_ci * We use percpu structure, so if we succeed, we exit with preemption 41768c2ecf20Sopenharmony_ci * and BH disabled, to make sure another thread or softirq handling 41778c2ecf20Sopenharmony_ci * wont try to get same context. 41788c2ecf20Sopenharmony_ci */ 41798c2ecf20Sopenharmony_cistruct tcp_md5sig_pool *tcp_get_md5sig_pool(void) 41808c2ecf20Sopenharmony_ci{ 41818c2ecf20Sopenharmony_ci local_bh_disable(); 41828c2ecf20Sopenharmony_ci 41838c2ecf20Sopenharmony_ci /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ 41848c2ecf20Sopenharmony_ci if (READ_ONCE(tcp_md5sig_pool_populated)) { 41858c2ecf20Sopenharmony_ci /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ 41868c2ecf20Sopenharmony_ci smp_rmb(); 41878c2ecf20Sopenharmony_ci return this_cpu_ptr(&tcp_md5sig_pool); 41888c2ecf20Sopenharmony_ci } 41898c2ecf20Sopenharmony_ci local_bh_enable(); 41908c2ecf20Sopenharmony_ci return NULL; 41918c2ecf20Sopenharmony_ci} 41928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_get_md5sig_pool); 41938c2ecf20Sopenharmony_ci 41948c2ecf20Sopenharmony_ciint tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 41958c2ecf20Sopenharmony_ci const struct sk_buff *skb, unsigned int header_len) 41968c2ecf20Sopenharmony_ci{ 41978c2ecf20Sopenharmony_ci struct scatterlist sg; 41988c2ecf20Sopenharmony_ci const struct tcphdr *tp = tcp_hdr(skb); 41998c2ecf20Sopenharmony_ci struct ahash_request *req = hp->md5_req; 42008c2ecf20Sopenharmony_ci unsigned int i; 42018c2ecf20Sopenharmony_ci const unsigned int head_data_len = skb_headlen(skb) > header_len ? 42028c2ecf20Sopenharmony_ci skb_headlen(skb) - header_len : 0; 42038c2ecf20Sopenharmony_ci const struct skb_shared_info *shi = skb_shinfo(skb); 42048c2ecf20Sopenharmony_ci struct sk_buff *frag_iter; 42058c2ecf20Sopenharmony_ci 42068c2ecf20Sopenharmony_ci sg_init_table(&sg, 1); 42078c2ecf20Sopenharmony_ci 42088c2ecf20Sopenharmony_ci sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); 42098c2ecf20Sopenharmony_ci ahash_request_set_crypt(req, &sg, NULL, head_data_len); 42108c2ecf20Sopenharmony_ci if (crypto_ahash_update(req)) 42118c2ecf20Sopenharmony_ci return 1; 42128c2ecf20Sopenharmony_ci 42138c2ecf20Sopenharmony_ci for (i = 0; i < shi->nr_frags; ++i) { 42148c2ecf20Sopenharmony_ci const skb_frag_t *f = &shi->frags[i]; 42158c2ecf20Sopenharmony_ci unsigned int offset = skb_frag_off(f); 42168c2ecf20Sopenharmony_ci struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); 42178c2ecf20Sopenharmony_ci 42188c2ecf20Sopenharmony_ci sg_set_page(&sg, page, skb_frag_size(f), 42198c2ecf20Sopenharmony_ci offset_in_page(offset)); 42208c2ecf20Sopenharmony_ci ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); 42218c2ecf20Sopenharmony_ci if (crypto_ahash_update(req)) 42228c2ecf20Sopenharmony_ci return 1; 42238c2ecf20Sopenharmony_ci } 42248c2ecf20Sopenharmony_ci 42258c2ecf20Sopenharmony_ci skb_walk_frags(skb, frag_iter) 42268c2ecf20Sopenharmony_ci if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) 42278c2ecf20Sopenharmony_ci return 1; 42288c2ecf20Sopenharmony_ci 42298c2ecf20Sopenharmony_ci return 0; 42308c2ecf20Sopenharmony_ci} 42318c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_md5_hash_skb_data); 42328c2ecf20Sopenharmony_ci 42338c2ecf20Sopenharmony_ciint tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) 42348c2ecf20Sopenharmony_ci{ 42358c2ecf20Sopenharmony_ci u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ 42368c2ecf20Sopenharmony_ci struct scatterlist sg; 42378c2ecf20Sopenharmony_ci 42388c2ecf20Sopenharmony_ci sg_init_one(&sg, key->key, keylen); 42398c2ecf20Sopenharmony_ci ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); 42408c2ecf20Sopenharmony_ci 42418c2ecf20Sopenharmony_ci /* We use data_race() because tcp_md5_do_add() might change key->key under us */ 42428c2ecf20Sopenharmony_ci return data_race(crypto_ahash_update(hp->md5_req)); 42438c2ecf20Sopenharmony_ci} 42448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(tcp_md5_hash_key); 42458c2ecf20Sopenharmony_ci 42468c2ecf20Sopenharmony_ci#endif 42478c2ecf20Sopenharmony_ci 42488c2ecf20Sopenharmony_civoid tcp_done(struct sock *sk) 42498c2ecf20Sopenharmony_ci{ 42508c2ecf20Sopenharmony_ci struct request_sock *req; 42518c2ecf20Sopenharmony_ci 42528c2ecf20Sopenharmony_ci /* We might be called with a new socket, after 42538c2ecf20Sopenharmony_ci * inet_csk_prepare_forced_close() has been called 42548c2ecf20Sopenharmony_ci * so we can not use lockdep_sock_is_held(sk) 42558c2ecf20Sopenharmony_ci */ 42568c2ecf20Sopenharmony_ci req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); 42578c2ecf20Sopenharmony_ci 42588c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 42598c2ecf20Sopenharmony_ci TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 42608c2ecf20Sopenharmony_ci 42618c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 42628c2ecf20Sopenharmony_ci tcp_clear_xmit_timers(sk); 42638c2ecf20Sopenharmony_ci if (req) 42648c2ecf20Sopenharmony_ci reqsk_fastopen_remove(sk, req, false); 42658c2ecf20Sopenharmony_ci 42668c2ecf20Sopenharmony_ci WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 42678c2ecf20Sopenharmony_ci 42688c2ecf20Sopenharmony_ci if (!sock_flag(sk, SOCK_DEAD)) 42698c2ecf20Sopenharmony_ci sk->sk_state_change(sk); 42708c2ecf20Sopenharmony_ci else 42718c2ecf20Sopenharmony_ci inet_csk_destroy_sock(sk); 42728c2ecf20Sopenharmony_ci} 42738c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_done); 42748c2ecf20Sopenharmony_ci 42758c2ecf20Sopenharmony_ciint tcp_abort(struct sock *sk, int err) 42768c2ecf20Sopenharmony_ci{ 42778c2ecf20Sopenharmony_ci if (!sk_fullsock(sk)) { 42788c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_NEW_SYN_RECV) { 42798c2ecf20Sopenharmony_ci struct request_sock *req = inet_reqsk(sk); 42808c2ecf20Sopenharmony_ci 42818c2ecf20Sopenharmony_ci local_bh_disable(); 42828c2ecf20Sopenharmony_ci inet_csk_reqsk_queue_drop(req->rsk_listener, req); 42838c2ecf20Sopenharmony_ci local_bh_enable(); 42848c2ecf20Sopenharmony_ci return 0; 42858c2ecf20Sopenharmony_ci } 42868c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 42878c2ecf20Sopenharmony_ci } 42888c2ecf20Sopenharmony_ci 42898c2ecf20Sopenharmony_ci /* Don't race with userspace socket closes such as tcp_close. */ 42908c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_SOCK_DESTROY 42918c2ecf20Sopenharmony_ci /* BPF context ensures sock locking. */ 42928c2ecf20Sopenharmony_ci if (!has_current_bpf_ctx()) 42938c2ecf20Sopenharmony_ci#endif /* CONFIG_TCP_SOCK_DESTROY */ 42948c2ecf20Sopenharmony_ci lock_sock(sk); 42958c2ecf20Sopenharmony_ci 42968c2ecf20Sopenharmony_ci if (sk->sk_state == TCP_LISTEN) { 42978c2ecf20Sopenharmony_ci tcp_set_state(sk, TCP_CLOSE); 42988c2ecf20Sopenharmony_ci inet_csk_listen_stop(sk); 42998c2ecf20Sopenharmony_ci } 43008c2ecf20Sopenharmony_ci 43018c2ecf20Sopenharmony_ci /* Don't race with BH socket closes such as inet_csk_listen_stop. */ 43028c2ecf20Sopenharmony_ci local_bh_disable(); 43038c2ecf20Sopenharmony_ci bh_lock_sock(sk); 43048c2ecf20Sopenharmony_ci 43058c2ecf20Sopenharmony_ci if (!sock_flag(sk, SOCK_DEAD)) { 43068c2ecf20Sopenharmony_ci sk->sk_err = err; 43078c2ecf20Sopenharmony_ci /* This barrier is coupled with smp_rmb() in tcp_poll() */ 43088c2ecf20Sopenharmony_ci smp_wmb(); 43098c2ecf20Sopenharmony_ci sk->sk_error_report(sk); 43108c2ecf20Sopenharmony_ci if (tcp_need_reset(sk->sk_state)) 43118c2ecf20Sopenharmony_ci tcp_send_active_reset(sk, GFP_ATOMIC); 43128c2ecf20Sopenharmony_ci tcp_done(sk); 43138c2ecf20Sopenharmony_ci } 43148c2ecf20Sopenharmony_ci 43158c2ecf20Sopenharmony_ci bh_unlock_sock(sk); 43168c2ecf20Sopenharmony_ci local_bh_enable(); 43178c2ecf20Sopenharmony_ci tcp_write_queue_purge(sk); 43188c2ecf20Sopenharmony_ci#ifdef CONFIG_TCP_SOCK_DESTROY 43198c2ecf20Sopenharmony_ci if (!has_current_bpf_ctx()) 43208c2ecf20Sopenharmony_ci#endif /* CONFIG_TCP_SOCK_DESTROY */ 43218c2ecf20Sopenharmony_ci release_sock(sk); 43228c2ecf20Sopenharmony_ci return 0; 43238c2ecf20Sopenharmony_ci} 43248c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_abort); 43258c2ecf20Sopenharmony_ci 43268c2ecf20Sopenharmony_ciextern struct tcp_congestion_ops tcp_reno; 43278c2ecf20Sopenharmony_ci 43288c2ecf20Sopenharmony_cistatic __initdata unsigned long thash_entries; 43298c2ecf20Sopenharmony_cistatic int __init set_thash_entries(char *str) 43308c2ecf20Sopenharmony_ci{ 43318c2ecf20Sopenharmony_ci ssize_t ret; 43328c2ecf20Sopenharmony_ci 43338c2ecf20Sopenharmony_ci if (!str) 43348c2ecf20Sopenharmony_ci return 0; 43358c2ecf20Sopenharmony_ci 43368c2ecf20Sopenharmony_ci ret = kstrtoul(str, 0, &thash_entries); 43378c2ecf20Sopenharmony_ci if (ret) 43388c2ecf20Sopenharmony_ci return 0; 43398c2ecf20Sopenharmony_ci 43408c2ecf20Sopenharmony_ci return 1; 43418c2ecf20Sopenharmony_ci} 43428c2ecf20Sopenharmony_ci__setup("thash_entries=", set_thash_entries); 43438c2ecf20Sopenharmony_ci 43448c2ecf20Sopenharmony_cistatic void __init tcp_init_mem(void) 43458c2ecf20Sopenharmony_ci{ 43468c2ecf20Sopenharmony_ci unsigned long limit = nr_free_buffer_pages() / 16; 43478c2ecf20Sopenharmony_ci 43488c2ecf20Sopenharmony_ci limit = max(limit, 128UL); 43498c2ecf20Sopenharmony_ci sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ 43508c2ecf20Sopenharmony_ci sysctl_tcp_mem[1] = limit; /* 6.25 % */ 43518c2ecf20Sopenharmony_ci sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ 43528c2ecf20Sopenharmony_ci} 43538c2ecf20Sopenharmony_ci 43548c2ecf20Sopenharmony_civoid __init tcp_init(void) 43558c2ecf20Sopenharmony_ci{ 43568c2ecf20Sopenharmony_ci int max_rshare, max_wshare, cnt; 43578c2ecf20Sopenharmony_ci unsigned long limit; 43588c2ecf20Sopenharmony_ci unsigned int i; 43598c2ecf20Sopenharmony_ci 43608c2ecf20Sopenharmony_ci BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); 43618c2ecf20Sopenharmony_ci BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > 43628c2ecf20Sopenharmony_ci sizeof_field(struct sk_buff, cb)); 43638c2ecf20Sopenharmony_ci 43648c2ecf20Sopenharmony_ci percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 43658c2ecf20Sopenharmony_ci 43668c2ecf20Sopenharmony_ci timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); 43678c2ecf20Sopenharmony_ci mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); 43688c2ecf20Sopenharmony_ci 43698c2ecf20Sopenharmony_ci inet_hashinfo_init(&tcp_hashinfo); 43708c2ecf20Sopenharmony_ci inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", 43718c2ecf20Sopenharmony_ci thash_entries, 21, /* one slot per 2 MB*/ 43728c2ecf20Sopenharmony_ci 0, 64 * 1024); 43738c2ecf20Sopenharmony_ci tcp_hashinfo.bind_bucket_cachep = 43748c2ecf20Sopenharmony_ci kmem_cache_create("tcp_bind_bucket", 43758c2ecf20Sopenharmony_ci sizeof(struct inet_bind_bucket), 0, 43768c2ecf20Sopenharmony_ci SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 43778c2ecf20Sopenharmony_ci 43788c2ecf20Sopenharmony_ci /* Size and allocate the main established and bind bucket 43798c2ecf20Sopenharmony_ci * hash tables. 43808c2ecf20Sopenharmony_ci * 43818c2ecf20Sopenharmony_ci * The methodology is similar to that of the buffer cache. 43828c2ecf20Sopenharmony_ci */ 43838c2ecf20Sopenharmony_ci tcp_hashinfo.ehash = 43848c2ecf20Sopenharmony_ci alloc_large_system_hash("TCP established", 43858c2ecf20Sopenharmony_ci sizeof(struct inet_ehash_bucket), 43868c2ecf20Sopenharmony_ci thash_entries, 43878c2ecf20Sopenharmony_ci 17, /* one slot per 128 KB of memory */ 43888c2ecf20Sopenharmony_ci 0, 43898c2ecf20Sopenharmony_ci NULL, 43908c2ecf20Sopenharmony_ci &tcp_hashinfo.ehash_mask, 43918c2ecf20Sopenharmony_ci 0, 43928c2ecf20Sopenharmony_ci thash_entries ? 0 : 512 * 1024); 43938c2ecf20Sopenharmony_ci for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) 43948c2ecf20Sopenharmony_ci INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 43958c2ecf20Sopenharmony_ci 43968c2ecf20Sopenharmony_ci if (inet_ehash_locks_alloc(&tcp_hashinfo)) 43978c2ecf20Sopenharmony_ci panic("TCP: failed to alloc ehash_locks"); 43988c2ecf20Sopenharmony_ci tcp_hashinfo.bhash = 43998c2ecf20Sopenharmony_ci alloc_large_system_hash("TCP bind", 44008c2ecf20Sopenharmony_ci sizeof(struct inet_bind_hashbucket), 44018c2ecf20Sopenharmony_ci tcp_hashinfo.ehash_mask + 1, 44028c2ecf20Sopenharmony_ci 17, /* one slot per 128 KB of memory */ 44038c2ecf20Sopenharmony_ci 0, 44048c2ecf20Sopenharmony_ci &tcp_hashinfo.bhash_size, 44058c2ecf20Sopenharmony_ci NULL, 44068c2ecf20Sopenharmony_ci 0, 44078c2ecf20Sopenharmony_ci 64 * 1024); 44088c2ecf20Sopenharmony_ci tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; 44098c2ecf20Sopenharmony_ci for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 44108c2ecf20Sopenharmony_ci spin_lock_init(&tcp_hashinfo.bhash[i].lock); 44118c2ecf20Sopenharmony_ci INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 44128c2ecf20Sopenharmony_ci } 44138c2ecf20Sopenharmony_ci 44148c2ecf20Sopenharmony_ci 44158c2ecf20Sopenharmony_ci cnt = tcp_hashinfo.ehash_mask + 1; 44168c2ecf20Sopenharmony_ci sysctl_tcp_max_orphans = cnt / 2; 44178c2ecf20Sopenharmony_ci 44188c2ecf20Sopenharmony_ci tcp_init_mem(); 44198c2ecf20Sopenharmony_ci /* Set per-socket limits to no more than 1/128 the pressure threshold */ 44208c2ecf20Sopenharmony_ci limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); 44218c2ecf20Sopenharmony_ci max_wshare = min(4UL*1024*1024, limit); 44228c2ecf20Sopenharmony_ci max_rshare = min(6UL*1024*1024, limit); 44238c2ecf20Sopenharmony_ci 44248c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 44258c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; 44268c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 44278c2ecf20Sopenharmony_ci 44288c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 44298c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_rmem[1] = 131072; 44308c2ecf20Sopenharmony_ci init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); 44318c2ecf20Sopenharmony_ci 44328c2ecf20Sopenharmony_ci pr_info("Hash tables configured (established %u bind %u)\n", 44338c2ecf20Sopenharmony_ci tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 44348c2ecf20Sopenharmony_ci 44358c2ecf20Sopenharmony_ci tcp_v4_init(); 44368c2ecf20Sopenharmony_ci tcp_metrics_init(); 44378c2ecf20Sopenharmony_ci BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); 44388c2ecf20Sopenharmony_ci tcp_tasklet_init(); 44398c2ecf20Sopenharmony_ci mptcp_init(); 44408c2ecf20Sopenharmony_ci} 4441