xref: /kernel/linux/linux-6.6/net/ipv4/tcp_ipv4.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
4 *		operating system.  INET is implemented using the  BSD Socket
5 *		interface as the means of communication with the user level.
6 *
7 *		Implementation of the Transmission Control Protocol(TCP).
8 *
9 *		IPv4 specific functions
10 *
11 *		code split from:
12 *		linux/ipv4/tcp.c
13 *		linux/ipv4/tcp_input.c
14 *		linux/ipv4/tcp_output.c
15 *
16 *		See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 *		David S. Miller	:	New socket lookup architecture.
22 *					This code is dedicated to John Dyson.
23 *		David S. Miller :	Change semantics of established hash,
24 *					half is devoted to TIME_WAIT sockets
25 *					and the rest go in the other half.
26 *		Andi Kleen :		Add support for syncookies and fixed
27 *					some bugs: ip options weren't passed to
28 *					the TCP layer, missed a check for an
29 *					ACK bit.
30 *		Andi Kleen :		Implemented fast path mtu discovery.
31 *	     				Fixed many serious bugs in the
32 *					request_sock handling and moved
33 *					most of it into the af independent code.
34 *					Added tail drop and some other bugfixes.
35 *					Added new listen semantics.
36 *		Mike McLagan	:	Routing by source
37 *	Juan Jose Ciarlante:		ip_dynaddr bits
38 *		Andi Kleen:		various fixes.
39 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40 *					coma.
41 *	Andi Kleen		:	Fix new listen.
42 *	Andi Kleen		:	Fix accept error reporting.
43 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45 *					a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60#include <linux/sched.h>
61
62#include <net/net_namespace.h>
63#include <net/icmp.h>
64#include <net/inet_hashtables.h>
65#include <net/tcp.h>
66#include <net/transp_v6.h>
67#include <net/ipv6.h>
68#include <net/inet_common.h>
69#include <net/timewait_sock.h>
70#include <net/xfrm.h>
71#include <net/secure_seq.h>
72#include <net/busy_poll.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79#include <linux/inetdevice.h>
80#include <linux/btf_ids.h>
81
82#include <crypto/hash.h>
83#include <linux/scatterlist.h>
84
85#include <trace/events/tcp.h>
86
87#ifdef CONFIG_TCP_MD5SIG
88static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90#endif
91
92struct inet_hashinfo tcp_hashinfo;
93EXPORT_SYMBOL(tcp_hashinfo);
94
95static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
97static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98{
99	return secure_tcp_seq(ip_hdr(skb)->daddr,
100			      ip_hdr(skb)->saddr,
101			      tcp_hdr(skb)->dest,
102			      tcp_hdr(skb)->source);
103}
104
105static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106{
107	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108}
109
110int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111{
112	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115	struct tcp_sock *tp = tcp_sk(sk);
116
117	if (reuse == 2) {
118		/* Still does not detect *everything* that goes through
119		 * lo, since we require a loopback src or dst address
120		 * or direct binding to 'lo' interface.
121		 */
122		bool loopback = false;
123		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124			loopback = true;
125#if IS_ENABLED(CONFIG_IPV6)
126		if (tw->tw_family == AF_INET6) {
127			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131				loopback = true;
132		} else
133#endif
134		{
135			if (ipv4_is_loopback(tw->tw_daddr) ||
136			    ipv4_is_loopback(tw->tw_rcv_saddr))
137				loopback = true;
138		}
139		if (!loopback)
140			reuse = 0;
141	}
142
143	/* With PAWS, it is safe from the viewpoint
144	   of data integrity. Even without PAWS it is safe provided sequence
145	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146
147	   Actually, the idea is close to VJ's one, only timestamp cache is
148	   held not per host, but per port pair and TW bucket is used as state
149	   holder.
150
151	   If TW bucket has been already destroyed we fall back to VJ's scheme
152	   and use initial timestamp retrieved from peer table.
153	 */
154	if (tcptw->tw_ts_recent_stamp &&
155	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156					    tcptw->tw_ts_recent_stamp)))) {
157		/* In case of repair and re-using TIME-WAIT sockets we still
158		 * want to be sure that it is safe as above but honor the
159		 * sequence numbers and time stamps set as part of the repair
160		 * process.
161		 *
162		 * Without this check re-using a TIME-WAIT socket with TCP
163		 * repair would accumulate a -1 on the repair assigned
164		 * sequence number. The first time it is reused the sequence
165		 * is -1, the second time -2, etc. This fixes that issue
166		 * without appearing to create any others.
167		 */
168		if (likely(!tp->repair)) {
169			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171			if (!seq)
172				seq = 1;
173			WRITE_ONCE(tp->write_seq, seq);
174			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
175			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176		}
177		sock_hold(sktw);
178		return 1;
179	}
180
181	return 0;
182}
183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
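
/*
 * For reference: the tw_reuse value read at the top of tcp_twsk_unique()
 * comes from the net.ipv4.tcp_tw_reuse sysctl. 0 disables TIME-WAIT reuse
 * for new outgoing connections, 1 enables it globally, and 2 (the branch
 * with the loopback checks above) enables it for loopback traffic only.
 */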
184
185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186			      int addr_len)
187{
188	/* This check is replicated from tcp_v4_connect() and intended to
189	 * prevent the BPF program called below from accessing bytes that are
190	 * outside of the bound specified by the user in addr_len.
191	 */
192	if (addr_len < sizeof(struct sockaddr_in))
193		return -EINVAL;
194
195	sock_owned_by_me(sk);
196
197	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198}
199
200/* This will initiate an outgoing connection. */
201int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202{
203	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204	struct inet_timewait_death_row *tcp_death_row;
205	struct inet_sock *inet = inet_sk(sk);
206	struct tcp_sock *tp = tcp_sk(sk);
207	struct ip_options_rcu *inet_opt;
208	struct net *net = sock_net(sk);
209	__be16 orig_sport, orig_dport;
210	__be32 daddr, nexthop;
211	struct flowi4 *fl4;
212	struct rtable *rt;
213	int err;
214
215	if (addr_len < sizeof(struct sockaddr_in))
216		return -EINVAL;
217
218	if (usin->sin_family != AF_INET)
219		return -EAFNOSUPPORT;
220
221	nexthop = daddr = usin->sin_addr.s_addr;
222	inet_opt = rcu_dereference_protected(inet->inet_opt,
223					     lockdep_sock_is_held(sk));
224	if (inet_opt && inet_opt->opt.srr) {
225		if (!daddr)
226			return -EINVAL;
227		nexthop = inet_opt->opt.faddr;
228	}
229
230	orig_sport = inet->inet_sport;
231	orig_dport = usin->sin_port;
232	fl4 = &inet->cork.fl.u.ip4;
233	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235			      orig_dport, sk);
236	if (IS_ERR(rt)) {
237		err = PTR_ERR(rt);
238		if (err == -ENETUNREACH)
239			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240		return err;
241	}
242
243	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244		ip_rt_put(rt);
245		return -ENETUNREACH;
246	}
247
248	if (!inet_opt || !inet_opt->opt.srr)
249		daddr = fl4->daddr;
250
251	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253	if (!inet->inet_saddr) {
254		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255		if (err) {
256			ip_rt_put(rt);
257			return err;
258		}
259	} else {
260		sk_rcv_saddr_set(sk, inet->inet_saddr);
261	}
262
263	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264		/* Reset inherited state */
265		tp->rx_opt.ts_recent	   = 0;
266		tp->rx_opt.ts_recent_stamp = 0;
267		if (likely(!tp->repair))
268			WRITE_ONCE(tp->write_seq, 0);
269	}
270
271	inet->inet_dport = usin->sin_port;
272	sk_daddr_set(sk, daddr);
273
274	inet_csk(sk)->icsk_ext_hdr_len = 0;
275	if (inet_opt)
276		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
280	/* Socket identity is still unknown (sport may be zero).
281	 * However, we set the state to SYN-SENT and, without releasing the
282	 * socket lock, select a source port, enter ourselves into the hash
283	 * tables and complete the initialization afterwards.
284	 */
285	tcp_set_state(sk, TCP_SYN_SENT);
286	err = inet_hash_connect(tcp_death_row, sk);
287	if (err)
288		goto failure;
289
290	sk_set_txhash(sk);
291
292	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293			       inet->inet_sport, inet->inet_dport, sk);
294	if (IS_ERR(rt)) {
295		err = PTR_ERR(rt);
296		rt = NULL;
297		goto failure;
298	}
299	/* OK, now commit destination to socket.  */
300	sk->sk_gso_type = SKB_GSO_TCPV4;
301	sk_setup_caps(sk, &rt->dst);
302	rt = NULL;
303
304	if (likely(!tp->repair)) {
305		if (!tp->write_seq)
306			WRITE_ONCE(tp->write_seq,
307				   secure_tcp_seq(inet->inet_saddr,
308						  inet->inet_daddr,
309						  inet->inet_sport,
310						  usin->sin_port));
311		WRITE_ONCE(tp->tsoffset,
312			   secure_tcp_ts_off(net, inet->inet_saddr,
313					     inet->inet_daddr));
314	}
315
316	atomic_set(&inet->inet_id, get_random_u16());
317
318	if (tcp_fastopen_defer_connect(sk, &err))
319		return err;
320	if (err)
321		goto failure;
322
323	err = tcp_connect(sk);
324
325	if (err)
326		goto failure;
327
328	return 0;
329
330failure:
331	/*
332	 * This unhashes the socket and releases the local port,
333	 * if necessary.
334	 */
335	tcp_set_state(sk, TCP_CLOSE);
336	inet_bhash2_reset_saddr(sk);
337	ip_rt_put(rt);
338	sk->sk_route_caps = 0;
339	inet->inet_dport = 0;
340	return err;
341}
342EXPORT_SYMBOL(tcp_v4_connect);
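
/*
 * For illustration (a minimal user-space sketch, not part of this file):
 * tcp_v4_connect() is reached via the connect() system call on an AF_INET
 * stream socket, e.g. against a hypothetical peer 192.0.2.1:80:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 *
 * The addr_len handed down is sizeof(struct sockaddr_in), which is what the
 * length checks in tcp_v4_pre_connect() and tcp_v4_connect() validate.
 */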
343
344/*
345 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346 * It can be called through tcp_release_cb() if the socket was owned by the user
347 * at the time tcp_v4_err() was called to handle the ICMP message.
348 */
349void tcp_v4_mtu_reduced(struct sock *sk)
350{
351	struct inet_sock *inet = inet_sk(sk);
352	struct dst_entry *dst;
353	u32 mtu;
354
355	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356		return;
357	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358	dst = inet_csk_update_pmtu(sk, mtu);
359	if (!dst)
360		return;
361
362	/* Something is about to go wrong... Remember the soft error
363	 * in case this connection is not able to recover.
364	 */
365	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
367
368	mtu = dst_mtu(dst);
369
370	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371	    ip_sk_accept_pmtu(sk) &&
372	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373		tcp_sync_mss(sk, mtu);
374
375		/* Resend the TCP packet because it's
376		 * clear that the old packet has been
377		 * dropped. This is the new "fast" path mtu
378		 * discovery.
379		 */
380		tcp_simple_retransmit(sk);
381	} /* else let the usual retransmit timer handle it */
382}
383EXPORT_SYMBOL(tcp_v4_mtu_reduced);
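
/*
 * For illustration (a user-space sketch, assuming an application that wants
 * to opt out of path MTU discovery): the inet->pmtudisc test above
 * corresponds to the per-socket IP_MTU_DISCOVER option, so a socket
 * configured as below will not have its MSS shrunk here:
 *
 *	int val = IP_PMTUDISC_DONT;
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */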
384
385static void do_redirect(struct sk_buff *skb, struct sock *sk)
386{
387	struct dst_entry *dst = __sk_dst_check(sk, 0);
388
389	if (dst)
390		dst->ops->redirect(dst, sk, skb);
391}
392
393
394/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396{
397	struct request_sock *req = inet_reqsk(sk);
398	struct net *net = sock_net(sk);
399
400	/* ICMPs are not backlogged, hence we cannot get
401	 * an established socket here.
402	 */
403	if (seq != tcp_rsk(req)->snt_isn) {
404		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405	} else if (abort) {
406		/*
407		 * Still in SYN_RECV, just remove it silently.
408		 * There is no good way to pass the error to the newly
409		 * created socket, and POSIX does not want network
410		 * errors returned from accept().
411		 */
412		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413		tcp_listendrop(req->rsk_listener);
414	}
415	reqsk_put(req);
416}
417EXPORT_SYMBOL(tcp_req_err);
418
419/* TCP-LD (RFC 6069) logic */
420void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421{
422	struct inet_connection_sock *icsk = inet_csk(sk);
423	struct tcp_sock *tp = tcp_sk(sk);
424	struct sk_buff *skb;
425	s32 remaining;
426	u32 delta_us;
427
428	if (sock_owned_by_user(sk))
429		return;
430
431	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
432	    !icsk->icsk_backoff)
433		return;
434
435	skb = tcp_rtx_queue_head(sk);
436	if (WARN_ON_ONCE(!skb))
437		return;
438
439	icsk->icsk_backoff--;
440	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
441	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442
443	tcp_mstamp_refresh(tp);
444	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
445	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446
447	if (remaining > 0) {
448		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449					  remaining, TCP_RTO_MAX);
450	} else {
451		/* RTO revert clocked out retransmission.
452		 * Will retransmit now.
453		 */
454		tcp_retransmit_timer(sk);
455	}
456}
457EXPORT_SYMBOL(tcp_ld_RTO_revert);
458
459/*
460 * This routine is called by the ICMP module when it gets some
461 * sort of error condition.  If err < 0 then the socket should
462 * be closed and the error returned to the user.  If err > 0
463 * it's just the icmp type << 8 | icmp code.  After adjustment
464 * header points to the first 8 bytes of the tcp header.  We need
465 * to find the appropriate port.
466 *
467 * The locking strategy used here is very "optimistic". When
468 * someone else accesses the socket the ICMP is just dropped
469 * and for some paths there is no check at all.
470 * A more general error queue to queue errors for later handling
471 * is probably better.
472 *
473 */
474
475int tcp_v4_err(struct sk_buff *skb, u32 info)
476{
477	const struct iphdr *iph = (const struct iphdr *)skb->data;
478	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479	struct tcp_sock *tp;
480	const int type = icmp_hdr(skb)->type;
481	const int code = icmp_hdr(skb)->code;
482	struct sock *sk;
483	struct request_sock *fastopen;
484	u32 seq, snd_una;
485	int err;
486	struct net *net = dev_net(skb->dev);
487
488	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489				       iph->daddr, th->dest, iph->saddr,
490				       ntohs(th->source), inet_iif(skb), 0);
491	if (!sk) {
492		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493		return -ENOENT;
494	}
495	if (sk->sk_state == TCP_TIME_WAIT) {
496		inet_twsk_put(inet_twsk(sk));
497		return 0;
498	}
499	seq = ntohl(th->seq);
500	if (sk->sk_state == TCP_NEW_SYN_RECV) {
501		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502				     type == ICMP_TIME_EXCEEDED ||
503				     (type == ICMP_DEST_UNREACH &&
504				      (code == ICMP_NET_UNREACH ||
505				       code == ICMP_HOST_UNREACH)));
506		return 0;
507	}
508
509	bh_lock_sock(sk);
510	/* If too many ICMPs get dropped on busy
511	 * servers this needs to be solved differently.
512	 * We do take care of the PMTU discovery (RFC1191) special case:
513	 * we can receive locally generated ICMP messages while the socket is held.
514	 */
515	if (sock_owned_by_user(sk)) {
516		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518	}
519	if (sk->sk_state == TCP_CLOSE)
520		goto out;
521
522	if (static_branch_unlikely(&ip4_min_ttl)) {
523		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
524		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526			goto out;
527		}
528	}
529
530	tp = tcp_sk(sk);
531	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
532	fastopen = rcu_dereference(tp->fastopen_rsk);
533	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534	if (sk->sk_state != TCP_LISTEN &&
535	    !between(seq, snd_una, tp->snd_nxt)) {
536		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537		goto out;
538	}
539
540	switch (type) {
541	case ICMP_REDIRECT:
542		if (!sock_owned_by_user(sk))
543			do_redirect(skb, sk);
544		goto out;
545	case ICMP_SOURCE_QUENCH:
546		/* Just silently ignore these. */
547		goto out;
548	case ICMP_PARAMETERPROB:
549		err = EPROTO;
550		break;
551	case ICMP_DEST_UNREACH:
552		if (code > NR_ICMP_UNREACH)
553			goto out;
554
555		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556			/* We are not interested in TCP_LISTEN and open_requests
557	 * (SYN-ACKs sent out by Linux are always <576 bytes so
558			 * they should go through unfragmented).
559			 */
560			if (sk->sk_state == TCP_LISTEN)
561				goto out;
562
563			WRITE_ONCE(tp->mtu_info, info);
564			if (!sock_owned_by_user(sk)) {
565				tcp_v4_mtu_reduced(sk);
566			} else {
567				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568					sock_hold(sk);
569			}
570			goto out;
571		}
572
573		err = icmp_err_convert[code].errno;
574		/* check if this ICMP message allows revert of backoff.
575		 * (see RFC 6069)
576		 */
577		if (!fastopen &&
578		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579			tcp_ld_RTO_revert(sk, seq);
580		break;
581	case ICMP_TIME_EXCEEDED:
582		err = EHOSTUNREACH;
583		break;
584	default:
585		goto out;
586	}
587
588	switch (sk->sk_state) {
589	case TCP_SYN_SENT:
590	case TCP_SYN_RECV:
591		/* Only in fast or simultaneous open. If a fast open socket is
592		 * already accepted it is treated as a connected one below.
593		 */
594		if (fastopen && !fastopen->sk)
595			break;
596
597		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598
599		if (!sock_owned_by_user(sk)) {
600			WRITE_ONCE(sk->sk_err, err);
601
602			sk_error_report(sk);
603
604			tcp_done(sk);
605		} else {
606			WRITE_ONCE(sk->sk_err_soft, err);
607		}
608		goto out;
609	}
610
611	/* If we've already connected we will keep trying
612	 * until we time out, or the user gives up.
613	 *
614	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
615	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
616	 * but it is obsoleted by PMTU discovery).
617	 *
618	 * Note that in the modern internet, where routing is unreliable
619	 * and broken firewalls sit in every dark corner sending random
620	 * errors ordered by their masters, even these two messages finally
621	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
622	 *
623	 * Now we are in compliance with RFCs.
624	 *							--ANK (980905)
625	 */
626
627	if (!sock_owned_by_user(sk) &&
628	    inet_test_bit(RECVERR, sk)) {
629		WRITE_ONCE(sk->sk_err, err);
630		sk_error_report(sk);
631	} else	{ /* Only an error on timeout */
632		WRITE_ONCE(sk->sk_err_soft, err);
633	}
634
635out:
636	bh_unlock_sock(sk);
637	sock_put(sk);
638	return 0;
639}
640
641void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642{
643	struct tcphdr *th = tcp_hdr(skb);
644
645	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646	skb->csum_start = skb_transport_header(skb) - skb->head;
647	skb->csum_offset = offsetof(struct tcphdr, check);
648}
649
650/* This routine computes an IPv4 TCP checksum. */
651void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652{
653	const struct inet_sock *inet = inet_sk(sk);
654
655	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656}
657EXPORT_SYMBOL(tcp_v4_send_check);
658
659/*
660 *	This routine will send an RST to the other tcp.
661 *
662 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
663 *		      for the reset?
664 *	Answer: if a packet caused an RST, it is not for a socket
665 *		existing in our system; if it is matched to a socket,
666 *		it is just a duplicate segment or a bug in the other side's TCP,
667 *		so we build the reply based only on the parameters that
668 *		arrived with the segment.
669 *	Exception: precedence violation. We do not implement it in any case.
670 */
671
672#ifdef CONFIG_TCP_MD5SIG
673#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674#else
675#define OPTION_BYTES sizeof(__be32)
676#endif
677
678static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679{
680	const struct tcphdr *th = tcp_hdr(skb);
681	struct {
682		struct tcphdr th;
683		__be32 opt[OPTION_BYTES / sizeof(__be32)];
684	} rep;
685	struct ip_reply_arg arg;
686#ifdef CONFIG_TCP_MD5SIG
687	struct tcp_md5sig_key *key = NULL;
688	const __u8 *hash_location = NULL;
689	unsigned char newhash[16];
690	int genhash;
691	struct sock *sk1 = NULL;
692#endif
693	u64 transmit_time = 0;
694	struct sock *ctl_sk;
695	struct net *net;
696	u32 txhash = 0;
697
698	/* Never send a reset in response to a reset. */
699	if (th->rst)
700		return;
701
702	/* If sk is not NULL, it means we did a successful lookup and the incoming
703	 * route had to be correct. The prequeue might have dropped our dst.
704	 */
705	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706		return;
707
708	/* Swap the send and the receive. */
709	memset(&rep, 0, sizeof(rep));
710	rep.th.dest   = th->source;
711	rep.th.source = th->dest;
712	rep.th.doff   = sizeof(struct tcphdr) / 4;
713	rep.th.rst    = 1;
714
715	if (th->ack) {
716		rep.th.seq = th->ack_seq;
717	} else {
718		rep.th.ack = 1;
719		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720				       skb->len - (th->doff << 2));
721	}
722
723	memset(&arg, 0, sizeof(arg));
724	arg.iov[0].iov_base = (unsigned char *)&rep;
725	arg.iov[0].iov_len  = sizeof(rep.th);
726
727	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728#ifdef CONFIG_TCP_MD5SIG
729	rcu_read_lock();
730	hash_location = tcp_parse_md5sig_option(th);
731	if (sk && sk_fullsock(sk)) {
732		const union tcp_md5_addr *addr;
733		int l3index;
734
735		/* sdif set, means packet ingressed via a device
736		 * in an L3 domain and inet_iif is set to it.
737		 */
738		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741	} else if (hash_location) {
742		const union tcp_md5_addr *addr;
743		int sdif = tcp_v4_sdif(skb);
744		int dif = inet_iif(skb);
745		int l3index;
746
747		/*
748		 * The active side is lost. Try to find the listening socket through
749		 * the source port, and then find the md5 key through the listening socket.
750		 * We do not lose any security here:
751		 * the incoming packet is checked against the md5 hash of the key we find,
752		 * and no RST is generated if the md5 hash doesn't match.
753		 */
754		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755					     NULL, 0, ip_hdr(skb)->saddr,
756					     th->source, ip_hdr(skb)->daddr,
757					     ntohs(th->source), dif, sdif);
758		/* don't send rst if it can't find key */
759		if (!sk1)
760			goto out;
761
762		/* sdif set, means packet ingressed via a device
763		 * in an L3 domain and dif is set to it.
764		 */
765		l3index = sdif ? dif : 0;
766		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768		if (!key)
769			goto out;
770
771
772		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773		if (genhash || memcmp(hash_location, newhash, 16) != 0)
774			goto out;
775
776	}
777
778	if (key) {
779		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780				   (TCPOPT_NOP << 16) |
781				   (TCPOPT_MD5SIG << 8) |
782				   TCPOLEN_MD5SIG);
783		/* Update length and the length the header thinks exists */
784		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785		rep.th.doff = arg.iov[0].iov_len / 4;
786
787		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788				     key, ip_hdr(skb)->saddr,
789				     ip_hdr(skb)->daddr, &rep.th);
790	}
791#endif
792	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793	if (rep.opt[0] == 0) {
794		__be32 mrst = mptcp_reset_option(skb);
795
796		if (mrst) {
797			rep.opt[0] = mrst;
798			arg.iov[0].iov_len += sizeof(mrst);
799			rep.th.doff = arg.iov[0].iov_len / 4;
800		}
801	}
802
803	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804				      ip_hdr(skb)->saddr, /* XXX */
805				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
806	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808
809	/* When the socket is gone, all binding information is lost and
810	 * routing might fail in this case. No choice here: if we choose to force
811	 * the input interface, we will misroute in the case of an asymmetric route.
812	 */
813	if (sk) {
814		arg.bound_dev_if = sk->sk_bound_dev_if;
815		if (sk_fullsock(sk))
816			trace_tcp_send_reset(sk, skb);
817	}
818
819	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821
822	arg.tos = ip_hdr(skb)->tos;
823	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824	local_bh_disable();
825	ctl_sk = this_cpu_read(ipv4_tcp_sk);
826	sock_net_set(ctl_sk, net);
827	if (sk) {
828		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829				   inet_twsk(sk)->tw_mark : sk->sk_mark;
830		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831				   inet_twsk(sk)->tw_priority : sk->sk_priority;
832		transmit_time = tcp_transmit_time(sk);
833		xfrm_sk_clone_policy(ctl_sk, sk);
834		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836	} else {
837		ctl_sk->sk_mark = 0;
838		ctl_sk->sk_priority = 0;
839	}
840	ip_send_unicast_reply(ctl_sk,
841			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
842			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843			      &arg, arg.iov[0].iov_len,
844			      transmit_time, txhash);
845
846	xfrm_sk_free_policy(ctl_sk);
847	sock_net_set(ctl_sk, &init_net);
848	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850	local_bh_enable();
851
852#ifdef CONFIG_TCP_MD5SIG
853out:
854	rcu_read_unlock();
855#endif
856}
857
858/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
859   outside of socket context, is certainly ugly. What can I do?
860 */
861
862static void tcp_v4_send_ack(const struct sock *sk,
863			    struct sk_buff *skb, u32 seq, u32 ack,
864			    u32 win, u32 tsval, u32 tsecr, int oif,
865			    struct tcp_md5sig_key *key,
866			    int reply_flags, u8 tos, u32 txhash)
867{
868	const struct tcphdr *th = tcp_hdr(skb);
869	struct {
870		struct tcphdr th;
871		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872#ifdef CONFIG_TCP_MD5SIG
873			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874#endif
875			];
876	} rep;
877	struct net *net = sock_net(sk);
878	struct ip_reply_arg arg;
879	struct sock *ctl_sk;
880	u64 transmit_time;
881
882	memset(&rep.th, 0, sizeof(struct tcphdr));
883	memset(&arg, 0, sizeof(arg));
884
885	arg.iov[0].iov_base = (unsigned char *)&rep;
886	arg.iov[0].iov_len  = sizeof(rep.th);
887	if (tsecr) {
888		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889				   (TCPOPT_TIMESTAMP << 8) |
890				   TCPOLEN_TIMESTAMP);
891		rep.opt[1] = htonl(tsval);
892		rep.opt[2] = htonl(tsecr);
893		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894	}
895
896	/* Swap the send and the receive. */
897	rep.th.dest    = th->source;
898	rep.th.source  = th->dest;
899	rep.th.doff    = arg.iov[0].iov_len / 4;
900	rep.th.seq     = htonl(seq);
901	rep.th.ack_seq = htonl(ack);
902	rep.th.ack     = 1;
903	rep.th.window  = htons(win);
904
905#ifdef CONFIG_TCP_MD5SIG
906	if (key) {
907		int offset = (tsecr) ? 3 : 0;
908
909		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910					  (TCPOPT_NOP << 16) |
911					  (TCPOPT_MD5SIG << 8) |
912					  TCPOLEN_MD5SIG);
913		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914		rep.th.doff = arg.iov[0].iov_len/4;
915
916		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917				    key, ip_hdr(skb)->saddr,
918				    ip_hdr(skb)->daddr, &rep.th);
919	}
920#endif
921	arg.flags = reply_flags;
922	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923				      ip_hdr(skb)->saddr, /* XXX */
924				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
925	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926	if (oif)
927		arg.bound_dev_if = oif;
928	arg.tos = tos;
929	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930	local_bh_disable();
931	ctl_sk = this_cpu_read(ipv4_tcp_sk);
932	sock_net_set(ctl_sk, net);
933	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937	transmit_time = tcp_transmit_time(sk);
938	ip_send_unicast_reply(ctl_sk,
939			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
940			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941			      &arg, arg.iov[0].iov_len,
942			      transmit_time, txhash);
943
944	sock_net_set(ctl_sk, &init_net);
945	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946	local_bh_enable();
947}
948
949static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950{
951	struct inet_timewait_sock *tw = inet_twsk(sk);
952	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954	tcp_v4_send_ack(sk, skb,
955			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958			tcptw->tw_ts_recent,
959			tw->tw_bound_dev_if,
960			tcp_twsk_md5_key(tcptw),
961			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962			tw->tw_tos,
963			tw->tw_txhash
964			);
965
966	inet_twsk_put(tw);
967}
968
969static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970				  struct request_sock *req)
971{
972	const union tcp_md5_addr *addr;
973	int l3index;
974
975	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977	 */
978	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979					     tcp_sk(sk)->snd_nxt;
980
981	/* RFC 7323 2.3
982	 * The window field (SEG.WND) of every outgoing segment, with the
983	 * exception of <SYN> segments, MUST be right-shifted by
984	 * Rcv.Wind.Shift bits:
985	 */
986	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988	tcp_v4_send_ack(sk, skb, seq,
989			tcp_rsk(req)->rcv_nxt,
990			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992			READ_ONCE(req->ts_recent),
993			0,
994			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996			ip_hdr(skb)->tos,
997			READ_ONCE(tcp_rsk(req)->txhash));
998}
999
1000/*
1001 *	Send a SYN-ACK after having received a SYN.
1002 *	This still operates on a request_sock only, not on a big
1003 *	socket.
1004 */
1005static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006			      struct flowi *fl,
1007			      struct request_sock *req,
1008			      struct tcp_fastopen_cookie *foc,
1009			      enum tcp_synack_type synack_type,
1010			      struct sk_buff *syn_skb)
1011{
1012	const struct inet_request_sock *ireq = inet_rsk(req);
1013	struct flowi4 fl4;
1014	int err = -1;
1015	struct sk_buff *skb;
1016	u8 tos;
1017
1018	/* First, grab a route. */
1019	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020		return -1;
1021
1022	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023
1024	if (skb) {
1025		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026
1027		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029				(inet_sk(sk)->tos & INET_ECN_MASK) :
1030				inet_sk(sk)->tos;
1031
1032		if (!INET_ECN_is_capable(tos) &&
1033		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1034			tos |= INET_ECN_ECT_0;
1035
1036		rcu_read_lock();
1037		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038					    ireq->ir_rmt_addr,
1039					    rcu_dereference(ireq->ireq_opt),
1040					    tos);
1041		rcu_read_unlock();
1042		err = net_xmit_eval(err);
1043	}
1044
1045	return err;
1046}
1047
1048/*
1049 *	IPv4 request_sock destructor.
1050 */
1051static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052{
1053	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054}
1055
1056#ifdef CONFIG_TCP_MD5SIG
1057/*
1058 * RFC2385 MD5 checksumming requires a mapping of
1059 * IP address->MD5 Key.
1060 * We need to maintain these in the sk structure.
1061 */
1062
1063DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1064EXPORT_SYMBOL(tcp_md5_needed);
1065
1066static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067{
1068	if (!old)
1069		return true;
1070
1071	/* l3index always overrides non-l3index */
1072	if (old->l3index && new->l3index == 0)
1073		return false;
1074	if (old->l3index == 0 && new->l3index)
1075		return true;
1076
1077	return old->prefixlen < new->prefixlen;
1078}
1079
1080/* Find the Key structure for an address.  */
1081struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082					   const union tcp_md5_addr *addr,
1083					   int family)
1084{
1085	const struct tcp_sock *tp = tcp_sk(sk);
1086	struct tcp_md5sig_key *key;
1087	const struct tcp_md5sig_info *md5sig;
1088	__be32 mask;
1089	struct tcp_md5sig_key *best_match = NULL;
1090	bool match;
1091
1092	/* caller either holds rcu_read_lock() or socket lock */
1093	md5sig = rcu_dereference_check(tp->md5sig_info,
1094				       lockdep_sock_is_held(sk));
1095	if (!md5sig)
1096		return NULL;
1097
1098	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099				 lockdep_sock_is_held(sk)) {
1100		if (key->family != family)
1101			continue;
1102		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103			continue;
1104		if (family == AF_INET) {
1105			mask = inet_make_mask(key->prefixlen);
1106			match = (key->addr.a4.s_addr & mask) ==
1107				(addr->a4.s_addr & mask);
1108#if IS_ENABLED(CONFIG_IPV6)
1109		} else if (family == AF_INET6) {
1110			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111						  key->prefixlen);
1112#endif
1113		} else {
1114			match = false;
1115		}
1116
1117		if (match && better_md5_match(best_match, key))
1118			best_match = key;
1119	}
1120	return best_match;
1121}
1122EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123
1124static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125						      const union tcp_md5_addr *addr,
1126						      int family, u8 prefixlen,
1127						      int l3index, u8 flags)
1128{
1129	const struct tcp_sock *tp = tcp_sk(sk);
1130	struct tcp_md5sig_key *key;
1131	unsigned int size = sizeof(struct in_addr);
1132	const struct tcp_md5sig_info *md5sig;
1133
1134	/* caller either holds rcu_read_lock() or socket lock */
1135	md5sig = rcu_dereference_check(tp->md5sig_info,
1136				       lockdep_sock_is_held(sk));
1137	if (!md5sig)
1138		return NULL;
1139#if IS_ENABLED(CONFIG_IPV6)
1140	if (family == AF_INET6)
1141		size = sizeof(struct in6_addr);
1142#endif
1143	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144				 lockdep_sock_is_held(sk)) {
1145		if (key->family != family)
1146			continue;
1147		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148			continue;
1149		if (key->l3index != l3index)
1150			continue;
1151		if (!memcmp(&key->addr, addr, size) &&
1152		    key->prefixlen == prefixlen)
1153			return key;
1154	}
1155	return NULL;
1156}
1157
1158struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159					 const struct sock *addr_sk)
1160{
1161	const union tcp_md5_addr *addr;
1162	int l3index;
1163
1164	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165						 addr_sk->sk_bound_dev_if);
1166	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168}
1169EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170
1171static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172{
1173	struct tcp_sock *tp = tcp_sk(sk);
1174	struct tcp_md5sig_info *md5sig;
1175
1176	md5sig = kmalloc(sizeof(*md5sig), gfp);
1177	if (!md5sig)
1178		return -ENOMEM;
1179
1180	sk_gso_disable(sk);
1181	INIT_HLIST_HEAD(&md5sig->head);
1182	rcu_assign_pointer(tp->md5sig_info, md5sig);
1183	return 0;
1184}
1185
1186/* This can be called on a newly created socket, from other files */
1187static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1188			    int family, u8 prefixlen, int l3index, u8 flags,
1189			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190{
1191	/* Add Key to the list */
1192	struct tcp_md5sig_key *key;
1193	struct tcp_sock *tp = tcp_sk(sk);
1194	struct tcp_md5sig_info *md5sig;
1195
1196	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197	if (key) {
1198		/* Pre-existing entry - just update that one.
1199		 * Note that the key might be used concurrently.
1200		 * data_race() is telling kcsan that we do not care about
1201		 * key mismatches, since changing the MD5 key on live flows
1202		 * can lead to packet drops.
1203		 */
1204		data_race(memcpy(key->key, newkey, newkeylen));
1205
1206		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1207		 * Also note that a reader could catch new key->keylen value
1208		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1209		 * at sock_kmalloc() time below these lines.
1210		 */
1211		WRITE_ONCE(key->keylen, newkeylen);
1212
1213		return 0;
1214	}
1215
1216	md5sig = rcu_dereference_protected(tp->md5sig_info,
1217					   lockdep_sock_is_held(sk));
1218
1219	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220	if (!key)
1221		return -ENOMEM;
1222	if (!tcp_alloc_md5sig_pool()) {
1223		sock_kfree_s(sk, key, sizeof(*key));
1224		return -ENOMEM;
1225	}
1226
1227	memcpy(key->key, newkey, newkeylen);
1228	key->keylen = newkeylen;
1229	key->family = family;
1230	key->prefixlen = prefixlen;
1231	key->l3index = l3index;
1232	key->flags = flags;
1233	memcpy(&key->addr, addr,
1234	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235								 sizeof(struct in_addr));
1236	hlist_add_head_rcu(&key->node, &md5sig->head);
1237	return 0;
1238}
1239
1240int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241		   int family, u8 prefixlen, int l3index, u8 flags,
1242		   const u8 *newkey, u8 newkeylen)
1243{
1244	struct tcp_sock *tp = tcp_sk(sk);
1245
1246	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248			return -ENOMEM;
1249
1250		if (!static_branch_inc(&tcp_md5_needed.key)) {
1251			struct tcp_md5sig_info *md5sig;
1252
1253			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254			rcu_assign_pointer(tp->md5sig_info, NULL);
1255			kfree_rcu(md5sig, rcu);
1256			return -EUSERS;
1257		}
1258	}
1259
1260	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261				newkey, newkeylen, GFP_KERNEL);
1262}
1263EXPORT_SYMBOL(tcp_md5_do_add);
1264
1265int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266		     int family, u8 prefixlen, int l3index,
1267		     struct tcp_md5sig_key *key)
1268{
1269	struct tcp_sock *tp = tcp_sk(sk);
1270
1271	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273			return -ENOMEM;
1274
1275		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276			struct tcp_md5sig_info *md5sig;
1277
1278			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280			rcu_assign_pointer(tp->md5sig_info, NULL);
1281			kfree_rcu(md5sig, rcu);
1282			return -EUSERS;
1283		}
1284	}
1285
1286	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287				key->flags, key->key, key->keylen,
1288				sk_gfp_mask(sk, GFP_ATOMIC));
1289}
1290EXPORT_SYMBOL(tcp_md5_key_copy);
1291
1292int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293		   u8 prefixlen, int l3index, u8 flags)
1294{
1295	struct tcp_md5sig_key *key;
1296
1297	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298	if (!key)
1299		return -ENOENT;
1300	hlist_del_rcu(&key->node);
1301	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302	kfree_rcu(key, rcu);
1303	return 0;
1304}
1305EXPORT_SYMBOL(tcp_md5_do_del);
1306
1307static void tcp_clear_md5_list(struct sock *sk)
1308{
1309	struct tcp_sock *tp = tcp_sk(sk);
1310	struct tcp_md5sig_key *key;
1311	struct hlist_node *n;
1312	struct tcp_md5sig_info *md5sig;
1313
1314	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315
1316	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317		hlist_del_rcu(&key->node);
1318		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319		kfree_rcu(key, rcu);
1320	}
1321}
1322
1323static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324				 sockptr_t optval, int optlen)
1325{
1326	struct tcp_md5sig cmd;
1327	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328	const union tcp_md5_addr *addr;
1329	u8 prefixlen = 32;
1330	int l3index = 0;
1331	u8 flags;
1332
1333	if (optlen < sizeof(cmd))
1334		return -EINVAL;
1335
1336	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337		return -EFAULT;
1338
1339	if (sin->sin_family != AF_INET)
1340		return -EINVAL;
1341
1342	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343
1344	if (optname == TCP_MD5SIG_EXT &&
1345	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346		prefixlen = cmd.tcpm_prefixlen;
1347		if (prefixlen > 32)
1348			return -EINVAL;
1349	}
1350
1351	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353		struct net_device *dev;
1354
1355		rcu_read_lock();
1356		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357		if (dev && netif_is_l3_master(dev))
1358			l3index = dev->ifindex;
1359
1360		rcu_read_unlock();
1361
1362		/* ok to reference set/not set outside of rcu;
1363		 * right now device MUST be an L3 master
1364		 */
1365		if (!dev || !l3index)
1366			return -EINVAL;
1367	}
1368
1369	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370
1371	if (!cmd.tcpm_keylen)
1372		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373
1374	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375		return -EINVAL;
1376
1377	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378			      cmd.tcpm_key, cmd.tcpm_keylen);
1379}
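
/*
 * For illustration (a minimal user-space sketch, not part of this file): the
 * option parsed above is configured with the UAPI struct tcp_md5sig from
 * <linux/tcp.h>, e.g. to bind a key to a hypothetical peer 192.0.2.1:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT additionally honours tcpm_prefixlen and tcpm_ifindex when
 * the corresponding TCP_MD5SIG_FLAG_* bits are set in tcpm_flags, matching
 * the prefixlen and l3index handling above.
 */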
1380
1381static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382				   __be32 daddr, __be32 saddr,
1383				   const struct tcphdr *th, int nbytes)
1384{
1385	struct tcp4_pseudohdr *bp;
1386	struct scatterlist sg;
1387	struct tcphdr *_th;
1388
1389	bp = hp->scratch;
1390	bp->saddr = saddr;
1391	bp->daddr = daddr;
1392	bp->pad = 0;
1393	bp->protocol = IPPROTO_TCP;
1394	bp->len = cpu_to_be16(nbytes);
1395
1396	_th = (struct tcphdr *)(bp + 1);
1397	memcpy(_th, th, sizeof(*th));
1398	_th->check = 0;
1399
1400	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402				sizeof(*bp) + sizeof(*th));
1403	return crypto_ahash_update(hp->md5_req);
1404}
1405
1406static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408{
1409	struct tcp_md5sig_pool *hp;
1410	struct ahash_request *req;
1411
1412	hp = tcp_get_md5sig_pool();
1413	if (!hp)
1414		goto clear_hash_noput;
1415	req = hp->md5_req;
1416
1417	if (crypto_ahash_init(req))
1418		goto clear_hash;
1419	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420		goto clear_hash;
1421	if (tcp_md5_hash_key(hp, key))
1422		goto clear_hash;
1423	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424	if (crypto_ahash_final(req))
1425		goto clear_hash;
1426
1427	tcp_put_md5sig_pool();
1428	return 0;
1429
1430clear_hash:
1431	tcp_put_md5sig_pool();
1432clear_hash_noput:
1433	memset(md5_hash, 0, 16);
1434	return 1;
1435}
1436
1437int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438			const struct sock *sk,
1439			const struct sk_buff *skb)
1440{
1441	struct tcp_md5sig_pool *hp;
1442	struct ahash_request *req;
1443	const struct tcphdr *th = tcp_hdr(skb);
1444	__be32 saddr, daddr;
1445
1446	if (sk) { /* valid for establish/request sockets */
1447		saddr = sk->sk_rcv_saddr;
1448		daddr = sk->sk_daddr;
1449	} else {
1450		const struct iphdr *iph = ip_hdr(skb);
1451		saddr = iph->saddr;
1452		daddr = iph->daddr;
1453	}
1454
1455	hp = tcp_get_md5sig_pool();
1456	if (!hp)
1457		goto clear_hash_noput;
1458	req = hp->md5_req;
1459
1460	if (crypto_ahash_init(req))
1461		goto clear_hash;
1462
1463	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464		goto clear_hash;
1465	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466		goto clear_hash;
1467	if (tcp_md5_hash_key(hp, key))
1468		goto clear_hash;
1469	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470	if (crypto_ahash_final(req))
1471		goto clear_hash;
1472
1473	tcp_put_md5sig_pool();
1474	return 0;
1475
1476clear_hash:
1477	tcp_put_md5sig_pool();
1478clear_hash_noput:
1479	memset(md5_hash, 0, 16);
1480	return 1;
1481}
1482EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
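
/*
 * For reference: the digest computed by the helpers above follows RFC 2385,
 * i.e. MD5 taken over, in order:
 *	1. the TCP pseudo-header (saddr, daddr, zero pad, IPPROTO_TCP, length),
 *	   as assembled by tcp_v4_md5_hash_headers(),
 *	2. the TCP header, excluding options, with its checksum field zeroed,
 *	3. the TCP segment payload (tcp_md5_hash_skb_data()),
 *	4. the connection key itself (tcp_md5_hash_key()).
 * The receiver recomputes the same digest and compares it against the value
 * carried in the TCPOPT_MD5SIG option before accepting the segment.
 */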
1483
1484#endif
1485
1486static void tcp_v4_init_req(struct request_sock *req,
1487			    const struct sock *sk_listener,
1488			    struct sk_buff *skb)
1489{
1490	struct inet_request_sock *ireq = inet_rsk(req);
1491	struct net *net = sock_net(sk_listener);
1492
1493	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496}
1497
1498static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499					  struct sk_buff *skb,
1500					  struct flowi *fl,
1501					  struct request_sock *req)
1502{
1503	tcp_v4_init_req(req, sk, skb);
1504
1505	if (security_inet_conn_request(sk, skb, req))
1506		return NULL;
1507
1508	return inet_csk_route_req(sk, &fl->u.ip4, req);
1509}
1510
1511struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1512	.family		=	PF_INET,
1513	.obj_size	=	sizeof(struct tcp_request_sock),
1514	.rtx_syn_ack	=	tcp_rtx_synack,
1515	.send_ack	=	tcp_v4_reqsk_send_ack,
1516	.destructor	=	tcp_v4_reqsk_destructor,
1517	.send_reset	=	tcp_v4_send_reset,
1518	.syn_ack_timeout =	tcp_syn_ack_timeout,
1519};
1520
1521const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1522	.mss_clamp	=	TCP_MSS_DEFAULT,
1523#ifdef CONFIG_TCP_MD5SIG
1524	.req_md5_lookup	=	tcp_v4_md5_lookup,
1525	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1526#endif
1527#ifdef CONFIG_SYN_COOKIES
1528	.cookie_init_seq =	cookie_v4_init_sequence,
1529#endif
1530	.route_req	=	tcp_v4_route_req,
1531	.init_seq	=	tcp_v4_init_seq,
1532	.init_ts_off	=	tcp_v4_init_ts_off,
1533	.send_synack	=	tcp_v4_send_synack,
1534};
1535
1536int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537{
1538	/* Never answer SYNs sent to broadcast or multicast */
1539	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540		goto drop;
1541
1542	return tcp_conn_request(&tcp_request_sock_ops,
1543				&tcp_request_sock_ipv4_ops, sk, skb);
1544
1545drop:
1546	tcp_listendrop(sk);
1547	return 0;
1548}
1549EXPORT_SYMBOL(tcp_v4_conn_request);
1550
1551
1552/*
1553 * The three-way handshake has completed - we got a valid ACK -
1554 * now create the new socket.
1555 */
1556struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1557				  struct request_sock *req,
1558				  struct dst_entry *dst,
1559				  struct request_sock *req_unhash,
1560				  bool *own_req)
1561{
1562	struct inet_request_sock *ireq;
1563	bool found_dup_sk = false;
1564	struct inet_sock *newinet;
1565	struct tcp_sock *newtp;
1566	struct sock *newsk;
1567#ifdef CONFIG_TCP_MD5SIG
1568	const union tcp_md5_addr *addr;
1569	struct tcp_md5sig_key *key;
1570	int l3index;
1571#endif
1572	struct ip_options_rcu *inet_opt;
1573
1574	if (sk_acceptq_is_full(sk))
1575		goto exit_overflow;
1576
1577	newsk = tcp_create_openreq_child(sk, req, skb);
1578	if (!newsk)
1579		goto exit_nonewsk;
1580
1581	newsk->sk_gso_type = SKB_GSO_TCPV4;
1582	inet_sk_rx_dst_set(newsk, skb);
1583
1584	newtp		      = tcp_sk(newsk);
1585	newinet		      = inet_sk(newsk);
1586	ireq		      = inet_rsk(req);
1587	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1588	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1589	newsk->sk_bound_dev_if = ireq->ir_iif;
1590	newinet->inet_saddr   = ireq->ir_loc_addr;
1591	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1592	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1593	newinet->mc_index     = inet_iif(skb);
1594	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1595	newinet->rcv_tos      = ip_hdr(skb)->tos;
1596	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1597	if (inet_opt)
1598		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1599	atomic_set(&newinet->inet_id, get_random_u16());
1600
1601	/* Set ToS of the new socket based upon the value of incoming SYN.
1602	 * ECT bits are set later in tcp_init_transfer().
1603	 */
1604	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1605		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1606
1607	if (!dst) {
1608		dst = inet_csk_route_child_sock(sk, newsk, req);
1609		if (!dst)
1610			goto put_and_exit;
1611	} else {
1612		/* syncookie case: see end of cookie_v4_check() */
1613	}
1614	sk_setup_caps(newsk, dst);
1615
1616	tcp_ca_openreq_child(newsk, dst);
1617
1618	tcp_sync_mss(newsk, dst_mtu(dst));
1619	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1620
1621	tcp_initialize_rcv_mss(newsk);
1622
1623#ifdef CONFIG_TCP_MD5SIG
1624	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1625	/* Copy over the MD5 key from the original socket */
1626	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1627	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1628	if (key) {
1629		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1630			goto put_and_exit;
1631		sk_gso_disable(newsk);
1632	}
1633#endif
1634
1635	if (__inet_inherit_port(sk, newsk) < 0)
1636		goto put_and_exit;
1637	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638				       &found_dup_sk);
1639	if (likely(*own_req)) {
1640		tcp_move_syn(newtp, req);
1641		ireq->ireq_opt = NULL;
1642	} else {
1643		newinet->inet_opt = NULL;
1644
1645		if (!req_unhash && found_dup_sk) {
1646			/* This code path should only be executed in the
1647			 * syncookie case
1648			 */
1649			bh_unlock_sock(newsk);
1650			sock_put(newsk);
1651			newsk = NULL;
1652		}
1653	}
1654	return newsk;
1655
1656exit_overflow:
1657	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658exit_nonewsk:
1659	dst_release(dst);
1660exit:
1661	tcp_listendrop(sk);
1662	return NULL;
1663put_and_exit:
1664	newinet->inet_opt = NULL;
1665	inet_csk_prepare_forced_close(newsk);
1666	tcp_done(newsk);
1667	goto exit;
1668}
1669EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670
1671static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672{
1673#ifdef CONFIG_SYN_COOKIES
1674	const struct tcphdr *th = tcp_hdr(skb);
1675
1676	if (!th->syn)
1677		sk = cookie_v4_check(sk, skb);
1678#endif
1679	return sk;
1680}
1681
1682u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683			 struct tcphdr *th, u32 *cookie)
1684{
1685	u16 mss = 0;
1686#ifdef CONFIG_SYN_COOKIES
1687	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688				    &tcp_request_sock_ipv4_ops, sk, th);
1689	if (mss) {
1690		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691		tcp_synq_overflow(sk);
1692	}
1693#endif
1694	return mss;
1695}
1696
1697INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698							   u32));
1699/* The socket must have its spinlock held when we get
1700 * here, unless it is a TCP_LISTEN socket.
1701 *
1702 * We have a potential double-lock case here, so even when
1703 * doing backlog processing we use the BH locking scheme.
1704 * This is because we cannot sleep with the original spinlock
1705 * held.
1706 */
1707int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708{
1709	enum skb_drop_reason reason;
1710	struct sock *rsk;
1711
1712	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1713		struct dst_entry *dst;
1714
1715		dst = rcu_dereference_protected(sk->sk_rx_dst,
1716						lockdep_sock_is_held(sk));
1717
1718		sock_rps_save_rxhash(sk, skb);
1719		sk_mark_napi_id(sk, skb);
1720		if (dst) {
1721			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1722			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723					     dst, 0)) {
1724				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1725				dst_release(dst);
1726			}
1727		}
1728		tcp_rcv_established(sk, skb);
1729		return 0;
1730	}
1731
1732	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1733	if (tcp_checksum_complete(skb))
1734		goto csum_err;
1735
1736	if (sk->sk_state == TCP_LISTEN) {
1737		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1738
1739		if (!nsk)
1740			goto discard;
1741		if (nsk != sk) {
1742			if (tcp_child_process(sk, nsk, skb)) {
1743				rsk = nsk;
1744				goto reset;
1745			}
1746			return 0;
1747		}
1748	} else
1749		sock_rps_save_rxhash(sk, skb);
1750
1751	if (tcp_rcv_state_process(sk, skb)) {
1752		rsk = sk;
1753		goto reset;
1754	}
1755	return 0;
1756
1757reset:
1758	tcp_v4_send_reset(rsk, skb);
1759discard:
1760	kfree_skb_reason(skb, reason);
1761	/* Be careful here. If this function gets more complicated and
1762	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1763	 * might be destroyed here. This current version compiles correctly,
1764	 * but you have been warned.
1765	 */
1766	return 0;
1767
1768csum_err:
1769	reason = SKB_DROP_REASON_TCP_CSUM;
1770	trace_tcp_bad_csum(skb);
1771	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1772	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1773	goto discard;
1774}
1775EXPORT_SYMBOL(tcp_v4_do_rcv);
1776
1777int tcp_v4_early_demux(struct sk_buff *skb)
1778{
1779	struct net *net = dev_net(skb->dev);
1780	const struct iphdr *iph;
1781	const struct tcphdr *th;
1782	struct sock *sk;
1783
1784	if (skb->pkt_type != PACKET_HOST)
1785		return 0;
1786
1787	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788		return 0;
1789
1790	iph = ip_hdr(skb);
1791	th = tcp_hdr(skb);
1792
1793	if (th->doff < sizeof(struct tcphdr) / 4)
1794		return 0;
1795
1796	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797				       iph->saddr, th->source,
1798				       iph->daddr, ntohs(th->dest),
1799				       skb->skb_iif, inet_sdif(skb));
1800	if (sk) {
1801		skb->sk = sk;
1802		skb->destructor = sock_edemux;
1803		if (sk_fullsock(sk)) {
1804			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805
1806			if (dst)
1807				dst = dst_check(dst, 0);
1808			if (dst &&
1809			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1810				skb_dst_set_noref(skb, dst);
1811		}
1812	}
1813	return 0;
1814}
1815
1816bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1817		     enum skb_drop_reason *reason)
1818{
1819	u32 limit, tail_gso_size, tail_gso_segs;
1820	struct skb_shared_info *shinfo;
1821	const struct tcphdr *th;
1822	struct tcphdr *thtail;
1823	struct sk_buff *tail;
1824	unsigned int hdrlen;
1825	bool fragstolen;
1826	u32 gso_segs;
1827	u32 gso_size;
1828	int delta;
1829
1830	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1831	 * we can fix skb->truesize to its real value to avoid future drops.
1832	 * This is valid because skb is not yet charged to the socket.
1833	 * It has been noticed that pure SACK packets were sometimes dropped
1834	 * (if cooked by drivers without copybreak feature).
1835	 */
1836	skb_condense(skb);
1837
1838	skb_dst_drop(skb);
1839
1840	if (unlikely(tcp_checksum_complete(skb))) {
1841		bh_unlock_sock(sk);
1842		trace_tcp_bad_csum(skb);
1843		*reason = SKB_DROP_REASON_TCP_CSUM;
1844		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1845		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1846		return true;
1847	}
1848
1849	/* Attempt coalescing to last skb in backlog, even if we are
1850	 * above the limits.
1851	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1852	 */
1853	th = (const struct tcphdr *)skb->data;
1854	hdrlen = th->doff * 4;
1855
1856	tail = sk->sk_backlog.tail;
1857	if (!tail)
1858		goto no_coalesce;
1859	thtail = (struct tcphdr *)tail->data;
1860
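	/* Coalescing is attempted only when skb directly continues tail in
	 * sequence space, carries the same IP DS field, has no SYN/RST/URG
	 * on either skb, has ACK set on both, matches the ECE/CWR flags and
	 * uses an identical TCP header length and options (plus the TLS
	 * decrypted-state and MPTCP compatibility checks below).
	 */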
1861	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1862	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1863	    ((TCP_SKB_CB(tail)->tcp_flags |
1864	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1865	    !((TCP_SKB_CB(tail)->tcp_flags &
1866	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1867	    ((TCP_SKB_CB(tail)->tcp_flags ^
1868	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1869#ifdef CONFIG_TLS_DEVICE
1870	    tail->decrypted != skb->decrypted ||
1871#endif
1872	    !mptcp_skb_can_collapse(tail, skb) ||
1873	    thtail->doff != th->doff ||
1874	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1875		goto no_coalesce;
1876
1877	__skb_pull(skb, hdrlen);
1878
1879	shinfo = skb_shinfo(skb);
1880	gso_size = shinfo->gso_size ?: skb->len;
1881	gso_segs = shinfo->gso_segs ?: 1;
1882
1883	shinfo = skb_shinfo(tail);
1884	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1885	tail_gso_segs = shinfo->gso_segs ?: 1;
1886
1887	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1888		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1889
1890		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1891			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1892			thtail->window = th->window;
1893		}
1894
1895		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1896		 * thtail->fin, so that the fast path in tcp_rcv_established()
1897		 * is not entered if we append a packet with a FIN.
1898		 * SYN, RST, URG are not present.
1899		 * ACK is set on both packets.
1900		 * PSH : we do not really care in TCP stack,
1901		 * PSH : the TCP stack does not really care about it,
1902		 */
1903		thtail->fin |= th->fin;
1904		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1905
1906		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1907			TCP_SKB_CB(tail)->has_rxtstamp = true;
1908			tail->tstamp = skb->tstamp;
1909			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1910		}
1911
1912		/* Not as strict as GRO. We only need to carry mss max value */
1913		shinfo->gso_size = max(gso_size, tail_gso_size);
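		/* gso_segs is a 16-bit field in struct skb_shared_info, hence
		 * the clamp of the combined segment count to 0xFFFF below.
		 */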
1914		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1915
1916		sk->sk_backlog.len += delta;
1917		__NET_INC_STATS(sock_net(sk),
1918				LINUX_MIB_TCPBACKLOGCOALESCE);
1919		kfree_skb_partial(skb, fragstolen);
1920		return false;
1921	}
1922	__skb_push(skb, hdrlen);
1923
1924no_coalesce:
1925	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1926
1927	/* Only socket owner can try to collapse/prune rx queues
1928	 * to reduce memory overhead, so add a little headroom here.
1929	 * Only a few socket backlogs are likely to be non-empty concurrently.
1930	 */
1931	limit += 64 * 1024;
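	/* For example, with the usual defaults of sk_rcvbuf = 131072 and
	 * sk_sndbuf = 16384, this allows roughly
	 * 131072 + 16384 / 2 + 65536 = 204800 bytes of backlog.
	 */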
1932
1933	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1934		bh_unlock_sock(sk);
1935		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1936		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1937		return true;
1938	}
1939	return false;
1940}
1941EXPORT_SYMBOL(tcp_add_backlog);
1942
1943int tcp_filter(struct sock *sk, struct sk_buff *skb)
1944{
1945	struct tcphdr *th = (struct tcphdr *)skb->data;
1946
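	/* Run the attached socket filter; the cap of th->doff * 4 ensures
	 * the filter can never trim the skb below the full TCP header.
	 */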
1947	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1948}
1949EXPORT_SYMBOL(tcp_filter);
1950
1951static void tcp_v4_restore_cb(struct sk_buff *skb)
1952{
1953	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1954		sizeof(struct inet_skb_parm));
1955}
1956
1957static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1958			   const struct tcphdr *th)
1959{
1960	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1961	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1962	 */
1963	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1964		sizeof(struct inet_skb_parm));
1965	barrier();
1966
1967	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
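	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn + th->fin terms in end_seq below.
	 */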
1968	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1969				    skb->len - th->doff * 4);
1970	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1971	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1972	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1973	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1974	TCP_SKB_CB(skb)->sacked	 = 0;
1975	TCP_SKB_CB(skb)->has_rxtstamp =
1976			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1977}
1978
1979/*
1980 *	From tcp_input.c
1981 */
1982
1983int tcp_v4_rcv(struct sk_buff *skb)
1984{
1985	struct net *net = dev_net(skb->dev);
1986	enum skb_drop_reason drop_reason;
1987	int sdif = inet_sdif(skb);
1988	int dif = inet_iif(skb);
1989	const struct iphdr *iph;
1990	const struct tcphdr *th;
1991	bool refcounted;
1992	struct sock *sk;
1993	int ret;
1994
1995	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1996	if (skb->pkt_type != PACKET_HOST)
1997		goto discard_it;
1998
1999	/* Count it even if it's bad */
2000	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2001
2002	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2003		goto discard_it;
2004
2005	th = (const struct tcphdr *)skb->data;
2006
2007	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2008		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2009		goto bad_packet;
2010	}
2011	if (!pskb_may_pull(skb, th->doff * 4))
2012		goto discard_it;
2013
2014	/* An explanation is required here, I think.
2015	 * Packet length and doff are validated by header prediction,
2016	 * provided the case of th->doff==0 is eliminated.
2017	 * So, we defer the checks. */
2018
2019	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2020		goto csum_error;
2021
2022	th = (const struct tcphdr *)skb->data;
2023	iph = ip_hdr(skb);
2024lookup:
2025	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2026			       skb, __tcp_hdrlen(th), th->source,
2027			       th->dest, sdif, &refcounted);
2028	if (!sk)
2029		goto no_tcp_socket;
2030
2031process:
2032	if (sk->sk_state == TCP_TIME_WAIT)
2033		goto do_time_wait;
2034
2035	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2036		struct request_sock *req = inet_reqsk(sk);
2037		bool req_stolen = false;
2038		struct sock *nsk;
2039
2040		sk = req->rsk_listener;
2041		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2042			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2043		else
2044			drop_reason = tcp_inbound_md5_hash(sk, skb,
2045						   &iph->saddr, &iph->daddr,
2046						   AF_INET, dif, sdif);
2047		if (unlikely(drop_reason)) {
2048			sk_drops_add(sk, skb);
2049			reqsk_put(req);
2050			goto discard_it;
2051		}
2052		if (tcp_checksum_complete(skb)) {
2053			reqsk_put(req);
2054			goto csum_error;
2055		}
2056		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2057			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2058			if (!nsk) {
2059				inet_csk_reqsk_queue_drop_and_put(sk, req);
2060				goto lookup;
2061			}
2062			sk = nsk;
2063			/* reuseport_migrate_sock() has already held one sk_refcnt
2064			 * before returning.
2065			 */
2066		} else {
2067			/* We own a reference on the listener, increase it again
2068			 * as we might lose it too soon.
2069			 */
2070			sock_hold(sk);
2071		}
2072		refcounted = true;
2073		nsk = NULL;
2074		if (!tcp_filter(sk, skb)) {
2075			th = (const struct tcphdr *)skb->data;
2076			iph = ip_hdr(skb);
2077			tcp_v4_fill_cb(skb, iph, th);
2078			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2079		} else {
2080			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2081		}
2082		if (!nsk) {
2083			reqsk_put(req);
2084			if (req_stolen) {
2085				/* Another cpu got exclusive access to req
2086				 * and created a full blown socket.
2087				 * Try to feed this packet to this socket
2088				 * instead of discarding it.
2089				 */
2090				tcp_v4_restore_cb(skb);
2091				sock_put(sk);
2092				goto lookup;
2093			}
2094			goto discard_and_relse;
2095		}
2096		nf_reset_ct(skb);
2097		if (nsk == sk) {
2098			reqsk_put(req);
2099			tcp_v4_restore_cb(skb);
2100		} else if (tcp_child_process(sk, nsk, skb)) {
2101			tcp_v4_send_reset(nsk, skb);
2102			goto discard_and_relse;
2103		} else {
2104			sock_put(sk);
2105			return 0;
2106		}
2107	}
2108
2109	if (static_branch_unlikely(&ip4_min_ttl)) {
2110		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2111		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2112			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2113			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2114			goto discard_and_relse;
2115		}
2116	}
2117
2118	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2119		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2120		goto discard_and_relse;
2121	}
2122
2123	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2124					   &iph->daddr, AF_INET, dif, sdif);
2125	if (drop_reason)
2126		goto discard_and_relse;
2127
2128	nf_reset_ct(skb);
2129
2130	if (tcp_filter(sk, skb)) {
2131		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2132		goto discard_and_relse;
2133	}
2134	th = (const struct tcphdr *)skb->data;
2135	iph = ip_hdr(skb);
2136	tcp_v4_fill_cb(skb, iph, th);
2137
2138	skb->dev = NULL;
2139
2140	if (sk->sk_state == TCP_LISTEN) {
2141		ret = tcp_v4_do_rcv(sk, skb);
2142		goto put_and_return;
2143	}
2144
2145	sk_incoming_cpu_update(sk);
2146
2147	bh_lock_sock_nested(sk);
2148	tcp_segs_in(tcp_sk(sk), skb);
2149	ret = 0;
2150	if (!sock_owned_by_user(sk)) {
2151		ret = tcp_v4_do_rcv(sk, skb);
2152	} else {
2153		if (tcp_add_backlog(sk, skb, &drop_reason))
2154			goto discard_and_relse;
2155	}
2156	bh_unlock_sock(sk);
2157
2158put_and_return:
2159	if (refcounted)
2160		sock_put(sk);
2161
2162	return ret;
2163
2164no_tcp_socket:
2165	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2166	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2167		goto discard_it;
2168
2169	tcp_v4_fill_cb(skb, iph, th);
2170
2171	if (tcp_checksum_complete(skb)) {
2172csum_error:
2173		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2174		trace_tcp_bad_csum(skb);
2175		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2176bad_packet:
2177		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2178	} else {
2179		tcp_v4_send_reset(NULL, skb);
2180	}
2181
2182discard_it:
2183	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2184	/* Discard frame. */
2185	kfree_skb_reason(skb, drop_reason);
2186	return 0;
2187
2188discard_and_relse:
2189	sk_drops_add(sk, skb);
2190	if (refcounted)
2191		sock_put(sk);
2192	goto discard_it;
2193
2194do_time_wait:
2195	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2196		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2197		inet_twsk_put(inet_twsk(sk));
2198		goto discard_it;
2199	}
2200
2201	tcp_v4_fill_cb(skb, iph, th);
2202
2203	if (tcp_checksum_complete(skb)) {
2204		inet_twsk_put(inet_twsk(sk));
2205		goto csum_error;
2206	}
2207	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2208	case TCP_TW_SYN: {
2209		struct sock *sk2 = inet_lookup_listener(net,
2210							net->ipv4.tcp_death_row.hashinfo,
2211							skb, __tcp_hdrlen(th),
2212							iph->saddr, th->source,
2213							iph->daddr, th->dest,
2214							inet_iif(skb),
2215							sdif);
2216		if (sk2) {
2217			inet_twsk_deschedule_put(inet_twsk(sk));
2218			sk = sk2;
2219			tcp_v4_restore_cb(skb);
2220			refcounted = false;
2221			goto process;
2222		}
2223	}
2224		/* to ACK */
2225		fallthrough;
2226	case TCP_TW_ACK:
2227		tcp_v4_timewait_ack(sk, skb);
2228		break;
2229	case TCP_TW_RST:
2230		tcp_v4_send_reset(sk, skb);
2231		inet_twsk_deschedule_put(inet_twsk(sk));
2232		goto discard_it;
2233	case TCP_TW_SUCCESS:;
2234	}
2235	goto discard_it;
2236}
2237
2238static struct timewait_sock_ops tcp_timewait_sock_ops = {
2239	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2240	.twsk_unique	= tcp_twsk_unique,
2241	.twsk_destructor= tcp_twsk_destructor,
2242};
2243
2244void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2245{
2246	struct dst_entry *dst = skb_dst(skb);
2247
2248	if (dst && dst_hold_safe(dst)) {
2249		rcu_assign_pointer(sk->sk_rx_dst, dst);
2250		sk->sk_rx_dst_ifindex = skb->skb_iif;
2251	}
2252}
2253EXPORT_SYMBOL(inet_sk_rx_dst_set);
2254
2255const struct inet_connection_sock_af_ops ipv4_specific = {
2256	.queue_xmit	   = ip_queue_xmit,
2257	.send_check	   = tcp_v4_send_check,
2258	.rebuild_header	   = inet_sk_rebuild_header,
2259	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2260	.conn_request	   = tcp_v4_conn_request,
2261	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2262	.net_header_len	   = sizeof(struct iphdr),
2263	.setsockopt	   = ip_setsockopt,
2264	.getsockopt	   = ip_getsockopt,
2265	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2266	.sockaddr_len	   = sizeof(struct sockaddr_in),
2267	.mtu_reduced	   = tcp_v4_mtu_reduced,
2268};
2269EXPORT_SYMBOL(ipv4_specific);
2270
2271#ifdef CONFIG_TCP_MD5SIG
2272static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2273	.md5_lookup		= tcp_v4_md5_lookup,
2274	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2275	.md5_parse		= tcp_v4_parse_md5_keys,
2276};
2277#endif
2278
2279/* NOTE: A lot of things are set to zero explicitly by the call to
2280 *       sk_alloc(), so they need not be done here.
2281 */
2282static int tcp_v4_init_sock(struct sock *sk)
2283{
2284	struct inet_connection_sock *icsk = inet_csk(sk);
2285
2286	tcp_init_sock(sk);
2287
2288	icsk->icsk_af_ops = &ipv4_specific;
2289
2290#ifdef CONFIG_TCP_MD5SIG
2291	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2292#endif
2293
2294	return 0;
2295}
2296
2297void tcp_v4_destroy_sock(struct sock *sk)
2298{
2299	struct tcp_sock *tp = tcp_sk(sk);
2300
2301	trace_tcp_destroy_sock(sk);
2302
2303	tcp_clear_xmit_timers(sk);
2304
2305	tcp_cleanup_congestion_control(sk);
2306
2307	tcp_cleanup_ulp(sk);
2308
2309	/* Clean up the write buffer. */
2310	tcp_write_queue_purge(sk);
2311
2312	/* Check if we want to disable active TFO */
2313	tcp_fastopen_active_disable_ofo_check(sk);
2314
2315	/* Cleans up our, hopefully empty, out_of_order_queue. */
2316	skb_rbtree_purge(&tp->out_of_order_queue);
2317
2318#ifdef CONFIG_TCP_MD5SIG
2319	/* Clean up the MD5 key list, if any */
2320	if (tp->md5sig_info) {
2321		tcp_clear_md5_list(sk);
2322		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2323		tp->md5sig_info = NULL;
2324		static_branch_slow_dec_deferred(&tcp_md5_needed);
2325	}
2326#endif
2327
2328	/* Clean up a referenced TCP bind bucket. */
2329	if (inet_csk(sk)->icsk_bind_hash)
2330		inet_put_port(sk);
2331
2332	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2333
2334	/* If socket is aborted during connect operation */
2335	tcp_free_fastopen_req(tp);
2336	tcp_fastopen_destroy_cipher(sk);
2337	tcp_saved_syn_free(tp);
2338
2339	sk_sockets_allocated_dec(sk);
2340}
2341EXPORT_SYMBOL(tcp_v4_destroy_sock);
2342
2343#ifdef CONFIG_PROC_FS
2344/* Proc filesystem TCP sock list dumping. */
2345
2346static unsigned short seq_file_family(const struct seq_file *seq);
2347
2348static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2349{
2350	unsigned short family = seq_file_family(seq);
2351
2352	/* AF_UNSPEC is used as a match all */
2353	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2354		net_eq(sock_net(sk), seq_file_net(seq)));
2355}
2356
2357/* Find a non empty bucket (starting from st->bucket)
2358 * and return the first sk from it.
2359 */
2360static void *listening_get_first(struct seq_file *seq)
2361{
2362	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2363	struct tcp_iter_state *st = seq->private;
2364
2365	st->offset = 0;
2366	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2367		struct inet_listen_hashbucket *ilb2;
2368		struct hlist_nulls_node *node;
2369		struct sock *sk;
2370
2371		ilb2 = &hinfo->lhash2[st->bucket];
2372		if (hlist_nulls_empty(&ilb2->nulls_head))
2373			continue;
2374
2375		spin_lock(&ilb2->lock);
2376		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2377			if (seq_sk_match(seq, sk))
2378				return sk;
2379		}
2380		spin_unlock(&ilb2->lock);
2381	}
2382
2383	return NULL;
2384}
2385
2386/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2387 * If "cur" is the last one in the st->bucket,
2388 * call listening_get_first() to return the first sk of the next
2389 * non empty bucket.
2390 */
2391static void *listening_get_next(struct seq_file *seq, void *cur)
2392{
2393	struct tcp_iter_state *st = seq->private;
2394	struct inet_listen_hashbucket *ilb2;
2395	struct hlist_nulls_node *node;
2396	struct inet_hashinfo *hinfo;
2397	struct sock *sk = cur;
2398
2399	++st->num;
2400	++st->offset;
2401
2402	sk = sk_nulls_next(sk);
2403	sk_nulls_for_each_from(sk, node) {
2404		if (seq_sk_match(seq, sk))
2405			return sk;
2406	}
2407
2408	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2409	ilb2 = &hinfo->lhash2[st->bucket];
2410	spin_unlock(&ilb2->lock);
2411	++st->bucket;
2412	return listening_get_first(seq);
2413}
2414
2415static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2416{
2417	struct tcp_iter_state *st = seq->private;
2418	void *rc;
2419
2420	st->bucket = 0;
2421	st->offset = 0;
2422	rc = listening_get_first(seq);
2423
2424	while (rc && *pos) {
2425		rc = listening_get_next(seq, rc);
2426		--*pos;
2427	}
2428	return rc;
2429}
2430
2431static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2432				const struct tcp_iter_state *st)
2433{
2434	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2435}
2436
2437/*
2438 * Get first established socket starting from bucket given in st->bucket.
2439 * If st->bucket is zero, the very first socket in the hash is returned.
2440 */
2441static void *established_get_first(struct seq_file *seq)
2442{
2443	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2444	struct tcp_iter_state *st = seq->private;
2445
2446	st->offset = 0;
2447	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2448		struct sock *sk;
2449		struct hlist_nulls_node *node;
2450		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2451
2452		cond_resched();
2453
2454		/* Lockless fast path for the common case of empty buckets */
2455		if (empty_bucket(hinfo, st))
2456			continue;
2457
2458		spin_lock_bh(lock);
2459		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2460			if (seq_sk_match(seq, sk))
2461				return sk;
2462		}
2463		spin_unlock_bh(lock);
2464	}
2465
2466	return NULL;
2467}
2468
2469static void *established_get_next(struct seq_file *seq, void *cur)
2470{
2471	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2472	struct tcp_iter_state *st = seq->private;
2473	struct hlist_nulls_node *node;
2474	struct sock *sk = cur;
2475
2476	++st->num;
2477	++st->offset;
2478
2479	sk = sk_nulls_next(sk);
2480
2481	sk_nulls_for_each_from(sk, node) {
2482		if (seq_sk_match(seq, sk))
2483			return sk;
2484	}
2485
2486	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2487	++st->bucket;
2488	return established_get_first(seq);
2489}
2490
2491static void *established_get_idx(struct seq_file *seq, loff_t pos)
2492{
2493	struct tcp_iter_state *st = seq->private;
2494	void *rc;
2495
2496	st->bucket = 0;
2497	rc = established_get_first(seq);
2498
2499	while (rc && pos) {
2500		rc = established_get_next(seq, rc);
2501		--pos;
2502	}
2503	return rc;
2504}
2505
2506static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2507{
2508	void *rc;
2509	struct tcp_iter_state *st = seq->private;
2510
2511	st->state = TCP_SEQ_STATE_LISTENING;
2512	rc	  = listening_get_idx(seq, &pos);
2513
2514	if (!rc) {
2515		st->state = TCP_SEQ_STATE_ESTABLISHED;
2516		rc	  = established_get_idx(seq, pos);
2517	}
2518
2519	return rc;
2520}
2521
2522static void *tcp_seek_last_pos(struct seq_file *seq)
2523{
2524	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2525	struct tcp_iter_state *st = seq->private;
2526	int bucket = st->bucket;
2527	int offset = st->offset;
2528	int orig_num = st->num;
2529	void *rc = NULL;
2530
2531	switch (st->state) {
2532	case TCP_SEQ_STATE_LISTENING:
2533		if (st->bucket > hinfo->lhash2_mask)
2534			break;
2535		rc = listening_get_first(seq);
2536		while (offset-- && rc && bucket == st->bucket)
2537			rc = listening_get_next(seq, rc);
2538		if (rc)
2539			break;
2540		st->bucket = 0;
2541		st->state = TCP_SEQ_STATE_ESTABLISHED;
2542		fallthrough;
2543	case TCP_SEQ_STATE_ESTABLISHED:
2544		if (st->bucket > hinfo->ehash_mask)
2545			break;
2546		rc = established_get_first(seq);
2547		while (offset-- && rc && bucket == st->bucket)
2548			rc = established_get_next(seq, rc);
2549	}
2550
2551	st->num = orig_num;
2552
2553	return rc;
2554}
2555
2556void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2557{
2558	struct tcp_iter_state *st = seq->private;
2559	void *rc;
2560
2561	if (*pos && *pos == st->last_pos) {
2562		rc = tcp_seek_last_pos(seq);
2563		if (rc)
2564			goto out;
2565	}
2566
2567	st->state = TCP_SEQ_STATE_LISTENING;
2568	st->num = 0;
2569	st->bucket = 0;
2570	st->offset = 0;
2571	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
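	/* Position 0 is SEQ_START_TOKEN (the header line), so the first real
	 * socket corresponds to *pos == 1, hence the *pos - 1 above.
	 */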
2572
2573out:
2574	st->last_pos = *pos;
2575	return rc;
2576}
2577EXPORT_SYMBOL(tcp_seq_start);
2578
2579void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2580{
2581	struct tcp_iter_state *st = seq->private;
2582	void *rc = NULL;
2583
2584	if (v == SEQ_START_TOKEN) {
2585		rc = tcp_get_idx(seq, 0);
2586		goto out;
2587	}
2588
2589	switch (st->state) {
2590	case TCP_SEQ_STATE_LISTENING:
2591		rc = listening_get_next(seq, v);
2592		if (!rc) {
2593			st->state = TCP_SEQ_STATE_ESTABLISHED;
2594			st->bucket = 0;
2595			st->offset = 0;
2596			rc	  = established_get_first(seq);
2597		}
2598		break;
2599	case TCP_SEQ_STATE_ESTABLISHED:
2600		rc = established_get_next(seq, v);
2601		break;
2602	}
2603out:
2604	++*pos;
2605	st->last_pos = *pos;
2606	return rc;
2607}
2608EXPORT_SYMBOL(tcp_seq_next);
2609
2610void tcp_seq_stop(struct seq_file *seq, void *v)
2611{
2612	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2613	struct tcp_iter_state *st = seq->private;
2614
2615	switch (st->state) {
2616	case TCP_SEQ_STATE_LISTENING:
2617		if (v != SEQ_START_TOKEN)
2618			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2619		break;
2620	case TCP_SEQ_STATE_ESTABLISHED:
2621		if (v)
2622			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2623		break;
2624	}
2625}
2626EXPORT_SYMBOL(tcp_seq_stop);
2627
2628static void get_openreq4(const struct request_sock *req,
2629			 struct seq_file *f, int i)
2630{
2631	const struct inet_request_sock *ireq = inet_rsk(req);
2632	long delta = req->rsk_timer.expires - jiffies;
2633
2634	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2635		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2636		i,
2637		ireq->ir_loc_addr,
2638		ireq->ir_num,
2639		ireq->ir_rmt_addr,
2640		ntohs(ireq->ir_rmt_port),
2641		TCP_SYN_RECV,
2642		0, 0, /* could print option size, but that is af dependent. */
2643		1,    /* timers active (only the expire timer) */
2644		jiffies_delta_to_clock_t(delta),
2645		req->num_timeout,
2646		from_kuid_munged(seq_user_ns(f),
2647				 sock_i_uid(req->rsk_listener)),
2648		0,  /* non standard timer */
2649		0, /* open_requests have no inode */
2650		0,
2651		req);
2652}
2653
2654static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2655{
2656	int timer_active;
2657	unsigned long timer_expires;
2658	const struct tcp_sock *tp = tcp_sk(sk);
2659	const struct inet_connection_sock *icsk = inet_csk(sk);
2660	const struct inet_sock *inet = inet_sk(sk);
2661	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2662	__be32 dest = inet->inet_daddr;
2663	__be32 src = inet->inet_rcv_saddr;
2664	__u16 destp = ntohs(inet->inet_dport);
2665	__u16 srcp = ntohs(inet->inet_sport);
2666	int rx_queue;
2667	int state;
2668
2669	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2670	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2671	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2672		timer_active	= 1;
2673		timer_expires	= icsk->icsk_timeout;
2674	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2675		timer_active	= 4;
2676		timer_expires	= icsk->icsk_timeout;
2677	} else if (timer_pending(&sk->sk_timer)) {
2678		timer_active	= 2;
2679		timer_expires	= sk->sk_timer.expires;
2680	} else {
2681		timer_active	= 0;
2682		timer_expires = jiffies;
2683	}
2684
2685	state = inet_sk_state_load(sk);
2686	if (state == TCP_LISTEN)
2687		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2688	else
2689		/* Because we don't lock the socket,
2690		 * we might find a transient negative value.
2691		 */
2692		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2693				      READ_ONCE(tp->copied_seq), 0);
2694
2695	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2696			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2697		i, src, srcp, dest, destp, state,
2698		READ_ONCE(tp->write_seq) - tp->snd_una,
2699		rx_queue,
2700		timer_active,
2701		jiffies_delta_to_clock_t(timer_expires - jiffies),
2702		icsk->icsk_retransmits,
2703		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2704		icsk->icsk_probes_out,
2705		sock_i_ino(sk),
2706		refcount_read(&sk->sk_refcnt), sk,
2707		jiffies_to_clock_t(icsk->icsk_rto),
2708		jiffies_to_clock_t(icsk->icsk_ack.ato),
2709		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2710		tcp_snd_cwnd(tp),
2711		state == TCP_LISTEN ?
2712		    fastopenq->max_qlen :
2713		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2714}
2715
2716static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2717			       struct seq_file *f, int i)
2718{
2719	long delta = tw->tw_timer.expires - jiffies;
2720	__be32 dest, src;
2721	__u16 destp, srcp;
2722
2723	dest  = tw->tw_daddr;
2724	src   = tw->tw_rcv_saddr;
2725	destp = ntohs(tw->tw_dport);
2726	srcp  = ntohs(tw->tw_sport);
2727
2728	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2729		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2730		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2731		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2732		refcount_read(&tw->tw_refcnt), tw);
2733}
2734
2735#define TMPSZ 150
2736
2737static int tcp4_seq_show(struct seq_file *seq, void *v)
2738{
2739	struct tcp_iter_state *st;
2740	struct sock *sk = v;
2741
2742	seq_setwidth(seq, TMPSZ - 1);
2743	if (v == SEQ_START_TOKEN) {
2744		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2745			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2746			   "inode");
2747		goto out;
2748	}
2749	st = seq->private;
2750
2751	if (sk->sk_state == TCP_TIME_WAIT)
2752		get_timewait4_sock(v, seq, st->num);
2753	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2754		get_openreq4(v, seq, st->num);
2755	else
2756		get_tcp4_sock(v, seq, st->num);
2757out:
2758	seq_pad(seq, '\n');
2759	return 0;
2760}
2761
2762#ifdef CONFIG_BPF_SYSCALL
2763struct bpf_tcp_iter_state {
2764	struct tcp_iter_state state;
2765	unsigned int cur_sk;
2766	unsigned int end_sk;
2767	unsigned int max_sk;
2768	struct sock **batch;
2769	bool st_bucket_done;
2770};
2771
2772struct bpf_iter__tcp {
2773	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2774	__bpf_md_ptr(struct sock_common *, sk_common);
2775	uid_t uid __aligned(8);
2776};
2777
2778static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2779			     struct sock_common *sk_common, uid_t uid)
2780{
2781	struct bpf_iter__tcp ctx;
2782
2783	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2784	ctx.meta = meta;
2785	ctx.sk_common = sk_common;
2786	ctx.uid = uid;
2787	return bpf_iter_run_prog(prog, &ctx);
2788}
2789
2790static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2791{
2792	while (iter->cur_sk < iter->end_sk)
2793		sock_gen_put(iter->batch[iter->cur_sk++]);
2794}
2795
2796static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2797				      unsigned int new_batch_sz)
2798{
2799	struct sock **new_batch;
2800
2801	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2802			     GFP_USER | __GFP_NOWARN);
2803	if (!new_batch)
2804		return -ENOMEM;
2805
2806	bpf_iter_tcp_put_batch(iter);
2807	kvfree(iter->batch);
2808	iter->batch = new_batch;
2809	iter->max_sk = new_batch_sz;
2810
2811	return 0;
2812}
2813
2814static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2815						 struct sock *start_sk)
2816{
2817	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2818	struct bpf_tcp_iter_state *iter = seq->private;
2819	struct tcp_iter_state *st = &iter->state;
2820	struct hlist_nulls_node *node;
2821	unsigned int expected = 1;
2822	struct sock *sk;
2823
2824	sock_hold(start_sk);
2825	iter->batch[iter->end_sk++] = start_sk;
2826
2827	sk = sk_nulls_next(start_sk);
2828	sk_nulls_for_each_from(sk, node) {
2829		if (seq_sk_match(seq, sk)) {
2830			if (iter->end_sk < iter->max_sk) {
2831				sock_hold(sk);
2832				iter->batch[iter->end_sk++] = sk;
2833			}
2834			expected++;
2835		}
2836	}
2837	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2838
2839	return expected;
2840}
2841
2842static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2843						   struct sock *start_sk)
2844{
2845	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2846	struct bpf_tcp_iter_state *iter = seq->private;
2847	struct tcp_iter_state *st = &iter->state;
2848	struct hlist_nulls_node *node;
2849	unsigned int expected = 1;
2850	struct sock *sk;
2851
2852	sock_hold(start_sk);
2853	iter->batch[iter->end_sk++] = start_sk;
2854
2855	sk = sk_nulls_next(start_sk);
2856	sk_nulls_for_each_from(sk, node) {
2857		if (seq_sk_match(seq, sk)) {
2858			if (iter->end_sk < iter->max_sk) {
2859				sock_hold(sk);
2860				iter->batch[iter->end_sk++] = sk;
2861			}
2862			expected++;
2863		}
2864	}
2865	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2866
2867	return expected;
2868}
2869
2870static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2871{
2872	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2873	struct bpf_tcp_iter_state *iter = seq->private;
2874	struct tcp_iter_state *st = &iter->state;
2875	unsigned int expected;
2876	bool resized = false;
2877	struct sock *sk;
2878
2879	/* The st->bucket is done.  Directly advance to the next
2880	 * bucket instead of having tcp_seek_last_pos() skip through
2881	 * the current bucket one sk at a time, only to find out that
2882	 * it has to advance to the next bucket.
2883	 */
2884	if (iter->st_bucket_done) {
2885		st->offset = 0;
2886		st->bucket++;
2887		if (st->state == TCP_SEQ_STATE_LISTENING &&
2888		    st->bucket > hinfo->lhash2_mask) {
2889			st->state = TCP_SEQ_STATE_ESTABLISHED;
2890			st->bucket = 0;
2891		}
2892	}
2893
2894again:
2895	/* Get a new batch */
2896	iter->cur_sk = 0;
2897	iter->end_sk = 0;
2898	iter->st_bucket_done = false;
2899
2900	sk = tcp_seek_last_pos(seq);
2901	if (!sk)
2902		return NULL; /* Done */
2903
2904	if (st->state == TCP_SEQ_STATE_LISTENING)
2905		expected = bpf_iter_tcp_listening_batch(seq, sk);
2906	else
2907		expected = bpf_iter_tcp_established_batch(seq, sk);
2908
2909	if (iter->end_sk == expected) {
2910		iter->st_bucket_done = true;
2911		return sk;
2912	}
2913
2914	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2915		resized = true;
2916		goto again;
2917	}
2918
2919	return sk;
2920}
2921
2922static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2923{
2924	/* bpf iter does not support lseek, so it always
2925	 * continues from where it was stop()-ped.
2926	 */
2927	if (*pos)
2928		return bpf_iter_tcp_batch(seq);
2929
2930	return SEQ_START_TOKEN;
2931}
2932
2933static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2934{
2935	struct bpf_tcp_iter_state *iter = seq->private;
2936	struct tcp_iter_state *st = &iter->state;
2937	struct sock *sk;
2938
2939	/* Whenever seq_next() is called, the iter->cur_sk is
2940	 * done with seq_show(), so advance to the next sk in
2941	 * the batch.
2942	 */
2943	if (iter->cur_sk < iter->end_sk) {
2944		/* Keeping st->num consistent in tcp_iter_state.
2945		 * bpf_iter_tcp does not use st->num.
2946		 * meta.seq_num is used instead.
2947		 */
2948		st->num++;
2949		/* Move st->offset to the next sk in the bucket such that
2950		 * the future start() will resume at st->offset in
2951		 * st->bucket.  See tcp_seek_last_pos().
2952		 */
2953		st->offset++;
2954		sock_gen_put(iter->batch[iter->cur_sk++]);
2955	}
2956
2957	if (iter->cur_sk < iter->end_sk)
2958		sk = iter->batch[iter->cur_sk];
2959	else
2960		sk = bpf_iter_tcp_batch(seq);
2961
2962	++*pos;
2963	/* Keeping st->last_pos consistent in tcp_iter_state.
2964	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2965	 */
2966	st->last_pos = *pos;
2967	return sk;
2968}
2969
2970static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2971{
2972	struct bpf_iter_meta meta;
2973	struct bpf_prog *prog;
2974	struct sock *sk = v;
2975	uid_t uid;
2976	int ret;
2977
2978	if (v == SEQ_START_TOKEN)
2979		return 0;
2980
2981	if (sk_fullsock(sk))
2982		lock_sock(sk);
2983
2984	if (unlikely(sk_unhashed(sk))) {
2985		ret = SEQ_SKIP;
2986		goto unlock;
2987	}
2988
2989	if (sk->sk_state == TCP_TIME_WAIT) {
2990		uid = 0;
2991	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2992		const struct request_sock *req = v;
2993
2994		uid = from_kuid_munged(seq_user_ns(seq),
2995				       sock_i_uid(req->rsk_listener));
2996	} else {
2997		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2998	}
2999
3000	meta.seq = seq;
3001	prog = bpf_iter_get_info(&meta, false);
3002	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3003
3004unlock:
3005	if (sk_fullsock(sk))
3006		release_sock(sk);
3007	return ret;
3008
3009}
3010
3011static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3012{
3013	struct bpf_tcp_iter_state *iter = seq->private;
3014	struct bpf_iter_meta meta;
3015	struct bpf_prog *prog;
3016
3017	if (!v) {
3018		meta.seq = seq;
3019		prog = bpf_iter_get_info(&meta, true);
3020		if (prog)
3021			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3022	}
3023
3024	if (iter->cur_sk < iter->end_sk) {
3025		bpf_iter_tcp_put_batch(iter);
3026		iter->st_bucket_done = false;
3027	}
3028}
3029
3030static const struct seq_operations bpf_iter_tcp_seq_ops = {
3031	.show		= bpf_iter_tcp_seq_show,
3032	.start		= bpf_iter_tcp_seq_start,
3033	.next		= bpf_iter_tcp_seq_next,
3034	.stop		= bpf_iter_tcp_seq_stop,
3035};
3036#endif
3037static unsigned short seq_file_family(const struct seq_file *seq)
3038{
3039	const struct tcp_seq_afinfo *afinfo;
3040
3041#ifdef CONFIG_BPF_SYSCALL
3042	/* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
3043	if (seq->op == &bpf_iter_tcp_seq_ops)
3044		return AF_UNSPEC;
3045#endif
3046
3047	/* Iterated from proc fs */
3048	afinfo = pde_data(file_inode(seq->file));
3049	return afinfo->family;
3050}
3051
3052static const struct seq_operations tcp4_seq_ops = {
3053	.show		= tcp4_seq_show,
3054	.start		= tcp_seq_start,
3055	.next		= tcp_seq_next,
3056	.stop		= tcp_seq_stop,
3057};
3058
3059static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3060	.family		= AF_INET,
3061};
3062
3063static int __net_init tcp4_proc_init_net(struct net *net)
3064{
3065	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3066			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3067		return -ENOMEM;
3068	return 0;
3069}
3070
3071static void __net_exit tcp4_proc_exit_net(struct net *net)
3072{
3073	remove_proc_entry("tcp", net->proc_net);
3074}
3075
3076static struct pernet_operations tcp4_net_ops = {
3077	.init = tcp4_proc_init_net,
3078	.exit = tcp4_proc_exit_net,
3079};
3080
3081int __init tcp4_proc_init(void)
3082{
3083	return register_pernet_subsys(&tcp4_net_ops);
3084}
3085
3086void tcp4_proc_exit(void)
3087{
3088	unregister_pernet_subsys(&tcp4_net_ops);
3089}
3090#endif /* CONFIG_PROC_FS */
3091
3092/* @wake is one when sk_stream_write_space() calls us.
3093 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3094 * This mimics the strategy used in sock_def_write_space().
3095 */
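/* Example: with @wake == 1 the test below reads
 * (2 * notsent_bytes) < tcp_notsent_lowat(tp), i.e. true is returned (and
 * EPOLLOUT eventually signalled) only once less than half of the not-sent
 * budget remains queued.
 */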
3096bool tcp_stream_memory_free(const struct sock *sk, int wake)
3097{
3098	const struct tcp_sock *tp = tcp_sk(sk);
3099	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3100			    READ_ONCE(tp->snd_nxt);
3101
3102	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3103}
3104EXPORT_SYMBOL(tcp_stream_memory_free);
3105
3106struct proto tcp_prot = {
3107	.name			= "TCP",
3108	.owner			= THIS_MODULE,
3109	.close			= tcp_close,
3110	.pre_connect		= tcp_v4_pre_connect,
3111	.connect		= tcp_v4_connect,
3112	.disconnect		= tcp_disconnect,
3113	.accept			= inet_csk_accept,
3114	.ioctl			= tcp_ioctl,
3115	.init			= tcp_v4_init_sock,
3116	.destroy		= tcp_v4_destroy_sock,
3117	.shutdown		= tcp_shutdown,
3118	.setsockopt		= tcp_setsockopt,
3119	.getsockopt		= tcp_getsockopt,
3120	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3121	.keepalive		= tcp_set_keepalive,
3122	.recvmsg		= tcp_recvmsg,
3123	.sendmsg		= tcp_sendmsg,
3124	.splice_eof		= tcp_splice_eof,
3125	.backlog_rcv		= tcp_v4_do_rcv,
3126	.release_cb		= tcp_release_cb,
3127	.hash			= inet_hash,
3128	.unhash			= inet_unhash,
3129	.get_port		= inet_csk_get_port,
3130	.put_port		= inet_put_port,
3131#ifdef CONFIG_BPF_SYSCALL
3132	.psock_update_sk_prot	= tcp_bpf_update_proto,
3133#endif
3134	.enter_memory_pressure	= tcp_enter_memory_pressure,
3135	.leave_memory_pressure	= tcp_leave_memory_pressure,
3136	.stream_memory_free	= tcp_stream_memory_free,
3137	.sockets_allocated	= &tcp_sockets_allocated,
3138	.orphan_count		= &tcp_orphan_count,
3139
3140	.memory_allocated	= &tcp_memory_allocated,
3141	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3142
3143	.memory_pressure	= &tcp_memory_pressure,
3144	.sysctl_mem		= sysctl_tcp_mem,
3145	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3146	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3147	.max_header		= MAX_TCP_HEADER,
3148	.obj_size		= sizeof(struct tcp_sock),
3149	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3150	.twsk_prot		= &tcp_timewait_sock_ops,
3151	.rsk_prot		= &tcp_request_sock_ops,
3152	.h.hashinfo		= NULL,
3153	.no_autobind		= true,
3154	.diag_destroy		= tcp_abort,
3155};
3156EXPORT_SYMBOL(tcp_prot);
3157
3158static void __net_exit tcp_sk_exit(struct net *net)
3159{
3160	if (net->ipv4.tcp_congestion_control)
3161		bpf_module_put(net->ipv4.tcp_congestion_control,
3162			       net->ipv4.tcp_congestion_control->owner);
3163}
3164
3165static void __net_init tcp_set_hashinfo(struct net *net)
3166{
3167	struct inet_hashinfo *hinfo;
3168	unsigned int ehash_entries;
3169	struct net *old_net;
3170
3171	if (net_eq(net, &init_net))
3172		goto fallback;
3173
3174	old_net = current->nsproxy->net_ns;
3175	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3176	if (!ehash_entries)
3177		goto fallback;
3178
3179	ehash_entries = roundup_pow_of_two(ehash_entries);
3180	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3181	if (!hinfo) {
3182		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3183			"for a netns, fallback to the global one\n",
3184			ehash_entries);
3185fallback:
3186		hinfo = &tcp_hashinfo;
3187		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3188	}
3189
3190	net->ipv4.tcp_death_row.hashinfo = hinfo;
3191	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3192	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3193}
3194
3195static int __net_init tcp_sk_init(struct net *net)
3196{
3197	net->ipv4.sysctl_tcp_ecn = 2;
3198	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3199
3200	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3201	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3202	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3203	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3204	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3205
3206	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3207	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3208	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3209
3210	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3211	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3212	net->ipv4.sysctl_tcp_syncookies = 1;
3213	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3214	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3215	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3216	net->ipv4.sysctl_tcp_orphan_retries = 0;
3217	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3218	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3219	net->ipv4.sysctl_tcp_tw_reuse = 2;
3220	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3221
3222	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3223	tcp_set_hashinfo(net);
3224
3225	net->ipv4.sysctl_tcp_sack = 1;
3226	net->ipv4.sysctl_tcp_window_scaling = 1;
3227	net->ipv4.sysctl_tcp_timestamps = 1;
3228	net->ipv4.sysctl_tcp_early_retrans = 3;
3229	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3230	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3231	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3232	net->ipv4.sysctl_tcp_max_reordering = 300;
3233	net->ipv4.sysctl_tcp_dsack = 1;
3234	net->ipv4.sysctl_tcp_app_win = 31;
3235	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3236	net->ipv4.sysctl_tcp_frto = 2;
3237	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3238	/* This limits the percentage of the congestion window which we
3239	 * will allow a single TSO frame to consume.  Building TSO frames
3240	 * which are too large can cause TCP streams to be bursty.
3241	 */
3242	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3243	/* Default TSQ limit of 16 TSO segments */
3244	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3245
3246	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3247	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3248
3249	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3250	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3251	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3252	net->ipv4.sysctl_tcp_autocorking = 1;
3253	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3254	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3255	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3256	if (net != &init_net) {
3257		memcpy(net->ipv4.sysctl_tcp_rmem,
3258		       init_net.ipv4.sysctl_tcp_rmem,
3259		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3260		memcpy(net->ipv4.sysctl_tcp_wmem,
3261		       init_net.ipv4.sysctl_tcp_wmem,
3262		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3263	}
3264	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3265	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3266	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3267	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3268	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3269	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3270
3271	/* Set default values for PLB */
3272	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3273	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3274	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3275	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3276	/* Default congestion threshold for PLB to mark a round is 50% */
3277	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3278
3279	/* Reno is always built in */
3280	if (!net_eq(net, &init_net) &&
3281	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3282			       init_net.ipv4.tcp_congestion_control->owner))
3283		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3284	else
3285		net->ipv4.tcp_congestion_control = &tcp_reno;
3286
3287	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3288	net->ipv4.sysctl_tcp_shrink_window = 0;
3289
3290	return 0;
3291}
3292
3293static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3294{
3295	struct net *net;
3296
3297	tcp_twsk_purge(net_exit_list, AF_INET);
3298
3299	list_for_each_entry(net, net_exit_list, exit_list) {
3300		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3301		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3302		tcp_fastopen_ctx_destroy(net);
3303	}
3304}
3305
3306static struct pernet_operations __net_initdata tcp_sk_ops = {
3307       .init	   = tcp_sk_init,
3308       .exit	   = tcp_sk_exit,
3309       .exit_batch = tcp_sk_exit_batch,
3310};
3311
3312#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3313DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3314		     struct sock_common *sk_common, uid_t uid)
3315
3316#define INIT_BATCH_SZ 16
3317
3318static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3319{
3320	struct bpf_tcp_iter_state *iter = priv_data;
3321	int err;
3322
3323	err = bpf_iter_init_seq_net(priv_data, aux);
3324	if (err)
3325		return err;
3326
3327	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3328	if (err) {
3329		bpf_iter_fini_seq_net(priv_data);
3330		return err;
3331	}
3332
3333	return 0;
3334}
3335
3336static void bpf_iter_fini_tcp(void *priv_data)
3337{
3338	struct bpf_tcp_iter_state *iter = priv_data;
3339
3340	bpf_iter_fini_seq_net(priv_data);
3341	kvfree(iter->batch);
3342}
3343
3344static const struct bpf_iter_seq_info tcp_seq_info = {
3345	.seq_ops		= &bpf_iter_tcp_seq_ops,
3346	.init_seq_private	= bpf_iter_init_tcp,
3347	.fini_seq_private	= bpf_iter_fini_tcp,
3348	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3349};
3350
3351static const struct bpf_func_proto *
3352bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3353			    const struct bpf_prog *prog)
3354{
3355	switch (func_id) {
3356	case BPF_FUNC_setsockopt:
3357		return &bpf_sk_setsockopt_proto;
3358	case BPF_FUNC_getsockopt:
3359		return &bpf_sk_getsockopt_proto;
3360	default:
3361		return NULL;
3362	}
3363}
3364
3365static struct bpf_iter_reg tcp_reg_info = {
3366	.target			= "tcp",
3367	.ctx_arg_info_size	= 1,
3368	.ctx_arg_info		= {
3369		{ offsetof(struct bpf_iter__tcp, sk_common),
3370		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3371	},
3372	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3373	.seq_info		= &tcp_seq_info,
3374};
3375
3376static void __init bpf_iter_register(void)
3377{
3378	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3379	if (bpf_iter_reg_target(&tcp_reg_info))
3380		pr_warn("Warning: could not register bpf iterator tcp\n");
3381}
3382
3383#endif
3384
3385void __init tcp_v4_init(void)
3386{
3387	int cpu, res;
3388
3389	for_each_possible_cpu(cpu) {
3390		struct sock *sk;
3391
3392		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3393					   IPPROTO_TCP, &init_net);
3394		if (res)
3395			panic("Failed to create the TCP control socket.\n");
3396		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3397
3398		/* Please enforce IP_DF and IPID==0 for RST and
3399		 * ACK sent in SYN-RECV and TIME-WAIT state.
3400		 */
3401		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3402
3403		per_cpu(ipv4_tcp_sk, cpu) = sk;
3404	}
3405	if (register_pernet_subsys(&tcp_sk_ops))
3406		panic("Failed to create the TCP control socket.\n");
3407
3408#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3409	bpf_iter_register();
3410#endif
3411}
3412