1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
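/* Per-CPU kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() below to transmit replies without a full socket context.
 */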
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
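/* Pick the initial sequence number for an incoming connection from the
 * packet's address/port 4-tuple.
 */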
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
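/* Derive the per-connection timestamp offset from the address pair. */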
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's one, only timestamp cache is
147 	   held not per host, but per port pair and TW bucket is used as state
148 	   holder.
149 
150 	   If TW bucket has been already destroyed we fall back to VJ's scheme
151 	   and use initial timestamp retrieved from peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
157 		 * and releasing the bucket lock.
158 		 */
159 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
160 			return 0;
161 
162 		/* In case of repair and re-using TIME-WAIT sockets we still
163 		 * want to be sure that it is safe as above but honor the
164 		 * sequence numbers and time stamps set as part of the repair
165 		 * process.
166 		 *
167 		 * Without this check re-using a TIME-WAIT socket with TCP
168 		 * repair would accumulate a -1 on the repair assigned
169 		 * sequence number. The first time it is reused the sequence
170 		 * is -1, the second time -2, etc. This fixes that issue
171 		 * without appearing to create any others.
172 		 */
173 		if (likely(!tp->repair)) {
174 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
175 
176 			if (!seq)
177 				seq = 1;
178 			WRITE_ONCE(tp->write_seq, seq);
179 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
180 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
181 		}
182 
183 		return 1;
184 	}
185 
186 	return 0;
187 }
188 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
189 
190 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
191 			      int addr_len)
192 {
193 	/* This check is replicated from tcp_v4_connect() and intended to
194 	 * prevent BPF program called below from accessing bytes that are out
195 	 * of the bound specified by user in addr_len.
196 	 */
197 	if (addr_len < sizeof(struct sockaddr_in))
198 		return -EINVAL;
199 
200 	sock_owned_by_me(sk);
201 
202 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
203 }
204 
205 /* This will initiate an outgoing connection. */
206 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
207 {
208 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
209 	struct inet_sock *inet = inet_sk(sk);
210 	struct tcp_sock *tp = tcp_sk(sk);
211 	__be16 orig_sport, orig_dport;
212 	__be32 daddr, nexthop;
213 	struct flowi4 *fl4;
214 	struct rtable *rt;
215 	int err;
216 	struct ip_options_rcu *inet_opt;
217 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
218 
219 	if (addr_len < sizeof(struct sockaddr_in))
220 		return -EINVAL;
221 
222 	if (usin->sin_family != AF_INET)
223 		return -EAFNOSUPPORT;
224 
225 	nexthop = daddr = usin->sin_addr.s_addr;
226 	inet_opt = rcu_dereference_protected(inet->inet_opt,
227 					     lockdep_sock_is_held(sk));
228 	if (inet_opt && inet_opt->opt.srr) {
229 		if (!daddr)
230 			return -EINVAL;
231 		nexthop = inet_opt->opt.faddr;
232 	}
233 
234 	orig_sport = inet->inet_sport;
235 	orig_dport = usin->sin_port;
236 	fl4 = &inet->cork.fl.u.ip4;
237 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
238 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
239 			      IPPROTO_TCP,
240 			      orig_sport, orig_dport, sk);
241 	if (IS_ERR(rt)) {
242 		err = PTR_ERR(rt);
243 		if (err == -ENETUNREACH)
244 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
245 		return err;
246 	}
247 
248 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
249 		ip_rt_put(rt);
250 		return -ENETUNREACH;
251 	}
252 
253 	if (!inet_opt || !inet_opt->opt.srr)
254 		daddr = fl4->daddr;
255 
256 	if (!inet->inet_saddr)
257 		inet->inet_saddr = fl4->saddr;
258 	sk_rcv_saddr_set(sk, inet->inet_saddr);
259 
260 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
261 		/* Reset inherited state */
262 		tp->rx_opt.ts_recent	   = 0;
263 		tp->rx_opt.ts_recent_stamp = 0;
264 		if (likely(!tp->repair))
265 			WRITE_ONCE(tp->write_seq, 0);
266 	}
267 
268 	inet->inet_dport = usin->sin_port;
269 	sk_daddr_set(sk, daddr);
270 
271 	inet_csk(sk)->icsk_ext_hdr_len = 0;
272 	if (inet_opt)
273 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
274 
275 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
276 
277 	/* Socket identity is still unknown (sport may be zero).
278 	 * However we set state to SYN-SENT and, without releasing the socket
279 	 * lock, select a source port, enter ourselves into the hash tables and
280 	 * complete initialization after this.
281 	 */
282 	tcp_set_state(sk, TCP_SYN_SENT);
283 	err = inet_hash_connect(tcp_death_row, sk);
284 	if (err)
285 		goto failure;
286 
287 	sk_set_txhash(sk);
288 
289 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
290 			       inet->inet_sport, inet->inet_dport, sk);
291 	if (IS_ERR(rt)) {
292 		err = PTR_ERR(rt);
293 		rt = NULL;
294 		goto failure;
295 	}
296 	/* OK, now commit destination to socket.  */
297 	sk->sk_gso_type = SKB_GSO_TCPV4;
298 	sk_setup_caps(sk, &rt->dst);
299 	rt = NULL;
300 
301 	if (likely(!tp->repair)) {
302 		if (!tp->write_seq)
303 			WRITE_ONCE(tp->write_seq,
304 				   secure_tcp_seq(inet->inet_saddr,
305 						  inet->inet_daddr,
306 						  inet->inet_sport,
307 						  usin->sin_port));
308 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
309 						 inet->inet_saddr,
310 						 inet->inet_daddr);
311 	}
312 
313 	inet->inet_id = prandom_u32();
314 
315 	if (tcp_fastopen_defer_connect(sk, &err))
316 		return err;
317 	if (err)
318 		goto failure;
319 
320 	err = tcp_connect(sk);
321 
322 	if (err)
323 		goto failure;
324 
325 	return 0;
326 
327 failure:
328 	/*
329 	 * This unhashes the socket and releases the local port,
330 	 * if necessary.
331 	 */
332 	tcp_set_state(sk, TCP_CLOSE);
333 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
334 		inet_reset_saddr(sk);
335 	ip_rt_put(rt);
336 	sk->sk_route_caps = 0;
337 	inet->inet_dport = 0;
338 	return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
341 
342 /*
343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
344  * It can be called through tcp_release_cb() if socket was owned by user
345  * at the time tcp_v4_err() was called to handle ICMP message.
346  */
347 void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349 	struct inet_sock *inet = inet_sk(sk);
350 	struct dst_entry *dst;
351 	u32 mtu;
352 
353 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354 		return;
355 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356 	dst = inet_csk_update_pmtu(sk, mtu);
357 	if (!dst)
358 		return;
359 
360 	/* Something is about to go wrong... Remember the soft error
361 	 * in case this connection is not able to recover.
362 	 */
363 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364 		sk->sk_err_soft = EMSGSIZE;
365 
366 	mtu = dst_mtu(dst);
367 
368 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369 	    ip_sk_accept_pmtu(sk) &&
370 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371 		tcp_sync_mss(sk, mtu);
372 
373 		/* Resend the TCP packet because it's
374 		 * clear that the old packet has been
375 		 * dropped. This is the new "fast" path mtu
376 		 * discovery.
377 		 */
378 		tcp_simple_retransmit(sk);
379 	} /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
382 
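/* Let the cached route handle an ICMP redirect for this socket. */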
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385 	struct dst_entry *dst = __sk_dst_check(sk, 0);
386 
387 	if (dst)
388 		dst->ops->redirect(dst, sk, skb);
389 }
390 
391 
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395 	struct request_sock *req = inet_reqsk(sk);
396 	struct net *net = sock_net(sk);
397 
398 	/* ICMPs are not backlogged, hence we cannot get
399 	 * an established socket here.
400 	 */
401 	if (seq != tcp_rsk(req)->snt_isn) {
402 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403 	} else if (abort) {
404 		/*
405 		 * Still in SYN_RECV, just remove it silently.
406 		 * There is no good way to pass the error to the newly
407 		 * created socket, and POSIX does not want network
408 		 * errors returned from accept().
409 		 */
410 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411 		tcp_listendrop(req->rsk_listener);
412 	}
413 	reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416 
417 /* TCP-LD (RFC 6069) logic */
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420 	struct inet_connection_sock *icsk = inet_csk(sk);
421 	struct tcp_sock *tp = tcp_sk(sk);
422 	struct sk_buff *skb;
423 	s32 remaining;
424 	u32 delta_us;
425 
426 	if (sock_owned_by_user(sk))
427 		return;
428 
429 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430 	    !icsk->icsk_backoff)
431 		return;
432 
433 	skb = tcp_rtx_queue_head(sk);
434 	if (WARN_ON_ONCE(!skb))
435 		return;
436 
437 	icsk->icsk_backoff--;
438 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440 
441 	tcp_mstamp_refresh(tp);
442 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444 
445 	if (remaining > 0) {
446 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447 					  remaining, TCP_RTO_MAX);
448 	} else {
449 		/* RTO revert clocked out retransmission.
450 		 * Will retransmit now.
451 		 */
452 		tcp_retransmit_timer(sk);
453 	}
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
456 
457 /*
458  * This routine is called by the ICMP module when it gets some
459  * sort of error condition.  If err < 0 then the socket should
460  * be closed and the error returned to the user.  If err > 0
461  * it's just the icmp type << 8 | icmp code.  After adjustment
462  * header points to the first 8 bytes of the tcp header.  We need
463  * to find the appropriate port.
464  *
465  * The locking strategy used here is very "optimistic". When
466  * someone else accesses the socket the ICMP is just dropped
467  * and for some paths there is no check at all.
468  * A more general error queue to queue errors for later handling
469  * is probably better.
470  *
471  */
472 
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475 	const struct iphdr *iph = (const struct iphdr *)skb->data;
476 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477 	struct tcp_sock *tp;
478 	struct inet_sock *inet;
479 	const int type = icmp_hdr(skb)->type;
480 	const int code = icmp_hdr(skb)->code;
481 	struct sock *sk;
482 	struct request_sock *fastopen;
483 	u32 seq, snd_una;
484 	int err;
485 	struct net *net = dev_net(skb->dev);
486 
487 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
488 				       th->dest, iph->saddr, ntohs(th->source),
489 				       inet_iif(skb), 0);
490 	if (!sk) {
491 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492 		return -ENOENT;
493 	}
494 	if (sk->sk_state == TCP_TIME_WAIT) {
495 		inet_twsk_put(inet_twsk(sk));
496 		return 0;
497 	}
498 	seq = ntohl(th->seq);
499 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
500 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501 				     type == ICMP_TIME_EXCEEDED ||
502 				     (type == ICMP_DEST_UNREACH &&
503 				      (code == ICMP_NET_UNREACH ||
504 				       code == ICMP_HOST_UNREACH)));
505 		return 0;
506 	}
507 
508 	bh_lock_sock(sk);
509 	/* If too many ICMPs get dropped on busy
510 	 * servers this needs to be solved differently.
511 	 * We do take care of the PMTU discovery (RFC1191) special case:
512 	 * we can receive locally generated ICMP messages while socket is held.
513 	 */
514 	if (sock_owned_by_user(sk)) {
515 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517 	}
518 	if (sk->sk_state == TCP_CLOSE)
519 		goto out;
520 
521 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
522 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
523 		goto out;
524 	}
525 
526 	tp = tcp_sk(sk);
527 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
528 	fastopen = rcu_dereference(tp->fastopen_rsk);
529 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
530 	if (sk->sk_state != TCP_LISTEN &&
531 	    !between(seq, snd_una, tp->snd_nxt)) {
532 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
533 		goto out;
534 	}
535 
536 	switch (type) {
537 	case ICMP_REDIRECT:
538 		if (!sock_owned_by_user(sk))
539 			do_redirect(skb, sk);
540 		goto out;
541 	case ICMP_SOURCE_QUENCH:
542 		/* Just silently ignore these. */
543 		goto out;
544 	case ICMP_PARAMETERPROB:
545 		err = EPROTO;
546 		break;
547 	case ICMP_DEST_UNREACH:
548 		if (code > NR_ICMP_UNREACH)
549 			goto out;
550 
551 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
552 			/* We are not interested in TCP_LISTEN and open_requests
553 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
554 			 * they should go through unfragmented).
555 			 */
556 			if (sk->sk_state == TCP_LISTEN)
557 				goto out;
558 
559 			WRITE_ONCE(tp->mtu_info, info);
560 			if (!sock_owned_by_user(sk)) {
561 				tcp_v4_mtu_reduced(sk);
562 			} else {
563 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
564 					sock_hold(sk);
565 			}
566 			goto out;
567 		}
568 
569 		err = icmp_err_convert[code].errno;
570 		/* check if this ICMP message allows revert of backoff.
571 		 * (see RFC 6069)
572 		 */
573 		if (!fastopen &&
574 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
575 			tcp_ld_RTO_revert(sk, seq);
576 		break;
577 	case ICMP_TIME_EXCEEDED:
578 		err = EHOSTUNREACH;
579 		break;
580 	default:
581 		goto out;
582 	}
583 
584 	switch (sk->sk_state) {
585 	case TCP_SYN_SENT:
586 	case TCP_SYN_RECV:
587 		/* Only in fast or simultaneous open. If a fast open socket is
588 		 * already accepted it is treated as a connected one below.
589 		 */
590 		if (fastopen && !fastopen->sk)
591 			break;
592 
593 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
594 
595 		if (!sock_owned_by_user(sk)) {
596 			sk->sk_err = err;
597 
598 			sk->sk_error_report(sk);
599 
600 			tcp_done(sk);
601 		} else {
602 			sk->sk_err_soft = err;
603 		}
604 		goto out;
605 	}
606 
607 	/* If we've already connected we will keep trying
608 	 * until we time out, or the user gives up.
609 	 *
610 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
611 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
612 	 * but it is obsoleted by pmtu discovery).
613 	 *
614 	 * Note that in the modern internet, where routing is unreliable
615 	 * and broken firewalls sit in every dark corner sending random
616 	 * errors ordered by their masters, even these two messages finally
617 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
618 	 *
619 	 * Now we are in compliance with RFCs.
620 	 *							--ANK (980905)
621 	 */
622 
623 	inet = inet_sk(sk);
624 	if (!sock_owned_by_user(sk) && inet->recverr) {
625 		sk->sk_err = err;
626 		sk->sk_error_report(sk);
627 	} else	{ /* Only an error on timeout */
628 		sk->sk_err_soft = err;
629 	}
630 
631 out:
632 	bh_unlock_sock(sk);
633 	sock_put(sk);
634 	return 0;
635 }
636 
637 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
638 {
639 	struct tcphdr *th = tcp_hdr(skb);
640 
641 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
642 	skb->csum_start = skb_transport_header(skb) - skb->head;
643 	skb->csum_offset = offsetof(struct tcphdr, check);
644 }
645 
646 /* This routine computes an IPv4 TCP checksum. */
647 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
648 {
649 	const struct inet_sock *inet = inet_sk(sk);
650 
651 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
652 }
653 EXPORT_SYMBOL(tcp_v4_send_check);
654 
655 /*
656  *	This routine will send an RST to the other tcp.
657  *
658  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
659  *		      for reset.
660  *	Answer: if a packet caused RST, it is not for a socket
661  *		existing in our system; if it is matched to a socket,
662  *		it is just a duplicate segment or a bug in the other side's TCP.
663  *		So we build the reply based only on the parameters
664  *		that arrived with the segment.
665  *	Exception: precedence violation. We do not implement it in any case.
666  */
667 
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 	const struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 #ifdef CONFIG_TCP_MD5SIG
674 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
675 #endif
676 	} rep;
677 	struct ip_reply_arg arg;
678 #ifdef CONFIG_TCP_MD5SIG
679 	struct tcp_md5sig_key *key = NULL;
680 	const __u8 *hash_location = NULL;
681 	unsigned char newhash[16];
682 	int genhash;
683 	struct sock *sk1 = NULL;
684 #endif
685 	u64 transmit_time = 0;
686 	struct sock *ctl_sk;
687 	struct net *net;
688 
689 	/* Never send a reset in response to a reset. */
690 	if (th->rst)
691 		return;
692 
693 	/* If sk is not NULL, it means we did a successful lookup and the
694 	 * incoming route had to be correct. prequeue might have dropped our dst.
695 	 */
696 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
697 		return;
698 
699 	/* Swap the send and the receive. */
700 	memset(&rep, 0, sizeof(rep));
701 	rep.th.dest   = th->source;
702 	rep.th.source = th->dest;
703 	rep.th.doff   = sizeof(struct tcphdr) / 4;
704 	rep.th.rst    = 1;
705 
706 	if (th->ack) {
707 		rep.th.seq = th->ack_seq;
708 	} else {
709 		rep.th.ack = 1;
710 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
711 				       skb->len - (th->doff << 2));
712 	}
713 
714 	memset(&arg, 0, sizeof(arg));
715 	arg.iov[0].iov_base = (unsigned char *)&rep;
716 	arg.iov[0].iov_len  = sizeof(rep.th);
717 
718 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
719 #ifdef CONFIG_TCP_MD5SIG
720 	rcu_read_lock();
721 	hash_location = tcp_parse_md5sig_option(th);
722 	if (sk && sk_fullsock(sk)) {
723 		const union tcp_md5_addr *addr;
724 		int l3index;
725 
726 		/* sdif set, means packet ingressed via a device
727 		 * in an L3 domain and inet_iif is set to it.
728 		 */
729 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
730 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
731 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
732 	} else if (hash_location) {
733 		const union tcp_md5_addr *addr;
734 		int sdif = tcp_v4_sdif(skb);
735 		int dif = inet_iif(skb);
736 		int l3index;
737 
738 		/*
739 		 * active side is lost. Try to find the listening socket through
740 		 * the source port, and then find the md5 key through the listening
741 		 * socket. We are not losing security here:
742 		 * the incoming packet is checked with the md5 hash of the found key;
743 		 * no RST is generated if the md5 hash doesn't match.
744 		 */
745 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
746 					     ip_hdr(skb)->saddr,
747 					     th->source, ip_hdr(skb)->daddr,
748 					     ntohs(th->source), dif, sdif);
749 		/* don't send rst if it can't find key */
750 		if (!sk1)
751 			goto out;
752 
753 		/* sdif set, means packet ingressed via a device
754 		 * in an L3 domain and dif is set to it.
755 		 */
756 		l3index = sdif ? dif : 0;
757 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
758 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
759 		if (!key)
760 			goto out;
761 
762 
763 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
764 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
765 			goto out;
766 
767 	}
768 
769 	if (key) {
770 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
771 				   (TCPOPT_NOP << 16) |
772 				   (TCPOPT_MD5SIG << 8) |
773 				   TCPOLEN_MD5SIG);
774 		/* Update length and the length the header thinks exists */
775 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
776 		rep.th.doff = arg.iov[0].iov_len / 4;
777 
778 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
779 				     key, ip_hdr(skb)->saddr,
780 				     ip_hdr(skb)->daddr, &rep.th);
781 	}
782 #endif
783 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
784 				      ip_hdr(skb)->saddr, /* XXX */
785 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
786 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
787 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
788 
789 	/* When the socket is gone, all binding information is lost, and
790 	 * routing might fail in this case. No choice here: if we force the
791 	 * input interface, we will misroute in case of an asymmetric route.
792 	 */
793 	if (sk) {
794 		arg.bound_dev_if = sk->sk_bound_dev_if;
795 		if (sk_fullsock(sk))
796 			trace_tcp_send_reset(sk, skb);
797 	}
798 
799 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
800 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
801 
802 	arg.tos = ip_hdr(skb)->tos;
803 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
804 	local_bh_disable();
805 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
806 	sock_net_set(ctl_sk, net);
807 	if (sk) {
808 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
809 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
810 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
811 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
812 		transmit_time = tcp_transmit_time(sk);
813 		xfrm_sk_clone_policy(ctl_sk, sk);
814 	} else {
815 		ctl_sk->sk_mark = 0;
816 		ctl_sk->sk_priority = 0;
817 	}
818 	ip_send_unicast_reply(ctl_sk,
819 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 			      &arg, arg.iov[0].iov_len,
822 			      transmit_time);
823 
824 	xfrm_sk_free_policy(ctl_sk);
825 	sock_net_set(ctl_sk, &init_net);
826 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
827 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
828 	local_bh_enable();
829 
830 #ifdef CONFIG_TCP_MD5SIG
831 out:
832 	rcu_read_unlock();
833 #endif
834 }
835 
836 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
837    outside socket context, is certainly ugly. What can I do?
838  */
839 
840 static void tcp_v4_send_ack(const struct sock *sk,
841 			    struct sk_buff *skb, u32 seq, u32 ack,
842 			    u32 win, u32 tsval, u32 tsecr, int oif,
843 			    struct tcp_md5sig_key *key,
844 			    int reply_flags, u8 tos)
845 {
846 	const struct tcphdr *th = tcp_hdr(skb);
847 	struct {
848 		struct tcphdr th;
849 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
850 #ifdef CONFIG_TCP_MD5SIG
851 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
852 #endif
853 			];
854 	} rep;
855 	struct net *net = sock_net(sk);
856 	struct ip_reply_arg arg;
857 	struct sock *ctl_sk;
858 	u64 transmit_time;
859 
860 	memset(&rep.th, 0, sizeof(struct tcphdr));
861 	memset(&arg, 0, sizeof(arg));
862 
863 	arg.iov[0].iov_base = (unsigned char *)&rep;
864 	arg.iov[0].iov_len  = sizeof(rep.th);
865 	if (tsecr) {
866 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
867 				   (TCPOPT_TIMESTAMP << 8) |
868 				   TCPOLEN_TIMESTAMP);
869 		rep.opt[1] = htonl(tsval);
870 		rep.opt[2] = htonl(tsecr);
871 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
872 	}
873 
874 	/* Swap the send and the receive. */
875 	rep.th.dest    = th->source;
876 	rep.th.source  = th->dest;
877 	rep.th.doff    = arg.iov[0].iov_len / 4;
878 	rep.th.seq     = htonl(seq);
879 	rep.th.ack_seq = htonl(ack);
880 	rep.th.ack     = 1;
881 	rep.th.window  = htons(win);
882 
883 #ifdef CONFIG_TCP_MD5SIG
884 	if (key) {
885 		int offset = (tsecr) ? 3 : 0;
886 
887 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
888 					  (TCPOPT_NOP << 16) |
889 					  (TCPOPT_MD5SIG << 8) |
890 					  TCPOLEN_MD5SIG);
891 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
892 		rep.th.doff = arg.iov[0].iov_len/4;
893 
894 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
895 				    key, ip_hdr(skb)->saddr,
896 				    ip_hdr(skb)->daddr, &rep.th);
897 	}
898 #endif
899 	arg.flags = reply_flags;
900 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
901 				      ip_hdr(skb)->saddr, /* XXX */
902 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
903 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
904 	if (oif)
905 		arg.bound_dev_if = oif;
906 	arg.tos = tos;
907 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
908 	local_bh_disable();
909 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
910 	sock_net_set(ctl_sk, net);
911 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
912 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
913 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
914 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
915 	transmit_time = tcp_transmit_time(sk);
916 	ip_send_unicast_reply(ctl_sk,
917 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
918 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
919 			      &arg, arg.iov[0].iov_len,
920 			      transmit_time);
921 
922 	sock_net_set(ctl_sk, &init_net);
923 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
924 	local_bh_enable();
925 }
926 
927 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
928 {
929 	struct inet_timewait_sock *tw = inet_twsk(sk);
930 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
931 
932 	tcp_v4_send_ack(sk, skb,
933 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
934 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
935 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
936 			tcptw->tw_ts_recent,
937 			tw->tw_bound_dev_if,
938 			tcp_twsk_md5_key(tcptw),
939 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
940 			tw->tw_tos
941 			);
942 
943 	inet_twsk_put(tw);
944 }
945 
946 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
947 				  struct request_sock *req)
948 {
949 	const union tcp_md5_addr *addr;
950 	int l3index;
951 
952 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
953 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
954 	 */
955 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
956 					     tcp_sk(sk)->snd_nxt;
957 
958 	/* RFC 7323 2.3
959 	 * The window field (SEG.WND) of every outgoing segment, with the
960 	 * exception of <SYN> segments, MUST be right-shifted by
961 	 * Rcv.Wind.Shift bits:
962 	 */
963 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
964 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
965 	tcp_v4_send_ack(sk, skb, seq,
966 			tcp_rsk(req)->rcv_nxt,
967 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
968 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
969 			READ_ONCE(req->ts_recent),
970 			0,
971 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
972 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
973 			ip_hdr(skb)->tos);
974 }
975 
976 /*
977  *	Send a SYN-ACK after having received a SYN.
978  *	This still operates on a request_sock only, not on a big
979  *	socket.
980  */
981 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
982 			      struct flowi *fl,
983 			      struct request_sock *req,
984 			      struct tcp_fastopen_cookie *foc,
985 			      enum tcp_synack_type synack_type,
986 			      struct sk_buff *syn_skb)
987 {
988 	const struct inet_request_sock *ireq = inet_rsk(req);
989 	struct flowi4 fl4;
990 	int err = -1;
991 	struct sk_buff *skb;
992 	u8 tos;
993 
994 	/* First, grab a route. */
995 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
996 		return -1;
997 
998 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
999 
1000 	if (skb) {
1001 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1002 
1003 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1004 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1005 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1006 				inet_sk(sk)->tos;
1007 
1008 		if (!INET_ECN_is_capable(tos) &&
1009 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1010 			tos |= INET_ECN_ECT_0;
1011 
1012 		rcu_read_lock();
1013 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1014 					    ireq->ir_rmt_addr,
1015 					    rcu_dereference(ireq->ireq_opt),
1016 					    tos);
1017 		rcu_read_unlock();
1018 		err = net_xmit_eval(err);
1019 	}
1020 
1021 	return err;
1022 }
1023 
1024 /*
1025  *	IPv4 request_sock destructor.
1026  */
1027 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1028 {
1029 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1030 }
1031 
1032 #ifdef CONFIG_TCP_MD5SIG
1033 /*
1034  * RFC2385 MD5 checksumming requires a mapping of
1035  * IP address->MD5 Key.
1036  * We need to maintain these in the sk structure.
1037  */
1038 
1039 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1040 EXPORT_SYMBOL(tcp_md5_needed);
1041 
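/* Key selection order: a key bound to an L3 domain beats an unbound one,
 * then the longer prefix wins.
 */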
1042 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1043 {
1044 	if (!old)
1045 		return true;
1046 
1047 	/* l3index always overrides non-l3index */
1048 	if (old->l3index && new->l3index == 0)
1049 		return false;
1050 	if (old->l3index == 0 && new->l3index)
1051 		return true;
1052 
1053 	return old->prefixlen < new->prefixlen;
1054 }
1055 
1056 /* Find the Key structure for an address.  */
1057 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1058 					   const union tcp_md5_addr *addr,
1059 					   int family)
1060 {
1061 	const struct tcp_sock *tp = tcp_sk(sk);
1062 	struct tcp_md5sig_key *key;
1063 	const struct tcp_md5sig_info *md5sig;
1064 	__be32 mask;
1065 	struct tcp_md5sig_key *best_match = NULL;
1066 	bool match;
1067 
1068 	/* caller either holds rcu_read_lock() or socket lock */
1069 	md5sig = rcu_dereference_check(tp->md5sig_info,
1070 				       lockdep_sock_is_held(sk));
1071 	if (!md5sig)
1072 		return NULL;
1073 
1074 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1075 				 lockdep_sock_is_held(sk)) {
1076 		if (key->family != family)
1077 			continue;
1078 		if (key->l3index && key->l3index != l3index)
1079 			continue;
1080 		if (family == AF_INET) {
1081 			mask = inet_make_mask(key->prefixlen);
1082 			match = (key->addr.a4.s_addr & mask) ==
1083 				(addr->a4.s_addr & mask);
1084 #if IS_ENABLED(CONFIG_IPV6)
1085 		} else if (family == AF_INET6) {
1086 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1087 						  key->prefixlen);
1088 #endif
1089 		} else {
1090 			match = false;
1091 		}
1092 
1093 		if (match && better_md5_match(best_match, key))
1094 			best_match = key;
1095 	}
1096 	return best_match;
1097 }
1098 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1099 
1100 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1101 						      const union tcp_md5_addr *addr,
1102 						      int family, u8 prefixlen,
1103 						      int l3index)
1104 {
1105 	const struct tcp_sock *tp = tcp_sk(sk);
1106 	struct tcp_md5sig_key *key;
1107 	unsigned int size = sizeof(struct in_addr);
1108 	const struct tcp_md5sig_info *md5sig;
1109 
1110 	/* caller either holds rcu_read_lock() or socket lock */
1111 	md5sig = rcu_dereference_check(tp->md5sig_info,
1112 				       lockdep_sock_is_held(sk));
1113 	if (!md5sig)
1114 		return NULL;
1115 #if IS_ENABLED(CONFIG_IPV6)
1116 	if (family == AF_INET6)
1117 		size = sizeof(struct in6_addr);
1118 #endif
1119 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1120 				 lockdep_sock_is_held(sk)) {
1121 		if (key->family != family)
1122 			continue;
1123 		if (key->l3index != l3index)
1124 			continue;
1125 		if (!memcmp(&key->addr, addr, size) &&
1126 		    key->prefixlen == prefixlen)
1127 			return key;
1128 	}
1129 	return NULL;
1130 }
1131 
1132 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1133 					 const struct sock *addr_sk)
1134 {
1135 	const union tcp_md5_addr *addr;
1136 	int l3index;
1137 
1138 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 						 addr_sk->sk_bound_dev_if);
1140 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 }
1143 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1144 
1145 /* This can be called on a newly created socket, from other files */
1146 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1147 		   int family, u8 prefixlen, int l3index,
1148 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1149 {
1150 	/* Add Key to the list */
1151 	struct tcp_md5sig_key *key;
1152 	struct tcp_sock *tp = tcp_sk(sk);
1153 	struct tcp_md5sig_info *md5sig;
1154 
1155 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1156 	if (key) {
1157 		/* Pre-existing entry - just update that one.
1158 		 * Note that the key might be used concurrently.
1159 		 * data_race() is telling KCSAN that we do not care about
1160 		 * key mismatches, since changing the MD5 key on live flows
1161 		 * can lead to packet drops.
1162 		 */
1163 		data_race(memcpy(key->key, newkey, newkeylen));
1164 
1165 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1166 		 * Also note that a reader could catch new key->keylen value
1167 		 * but old key->key[], this is the reason we use __GFP_ZERO
1168 		 * at sock_kmalloc() time below these lines.
1169 		 */
1170 		WRITE_ONCE(key->keylen, newkeylen);
1171 
1172 		return 0;
1173 	}
1174 
1175 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1176 					   lockdep_sock_is_held(sk));
1177 	if (!md5sig) {
1178 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1179 		if (!md5sig)
1180 			return -ENOMEM;
1181 
1182 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 		INIT_HLIST_HEAD(&md5sig->head);
1184 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1185 	}
1186 
1187 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1188 	if (!key)
1189 		return -ENOMEM;
1190 	if (!tcp_alloc_md5sig_pool()) {
1191 		sock_kfree_s(sk, key, sizeof(*key));
1192 		return -ENOMEM;
1193 	}
1194 
1195 	memcpy(key->key, newkey, newkeylen);
1196 	key->keylen = newkeylen;
1197 	key->family = family;
1198 	key->prefixlen = prefixlen;
1199 	key->l3index = l3index;
1200 	memcpy(&key->addr, addr,
1201 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1202 				      sizeof(struct in_addr));
1203 	hlist_add_head_rcu(&key->node, &md5sig->head);
1204 	return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_add);
1207 
1208 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1209 		   u8 prefixlen, int l3index)
1210 {
1211 	struct tcp_md5sig_key *key;
1212 
1213 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1214 	if (!key)
1215 		return -ENOENT;
1216 	hlist_del_rcu(&key->node);
1217 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1218 	kfree_rcu(key, rcu);
1219 	return 0;
1220 }
1221 EXPORT_SYMBOL(tcp_md5_do_del);
1222 
1223 static void tcp_clear_md5_list(struct sock *sk)
1224 {
1225 	struct tcp_sock *tp = tcp_sk(sk);
1226 	struct tcp_md5sig_key *key;
1227 	struct hlist_node *n;
1228 	struct tcp_md5sig_info *md5sig;
1229 
1230 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1231 
1232 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1233 		hlist_del_rcu(&key->node);
1234 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1235 		kfree_rcu(key, rcu);
1236 	}
1237 }
1238 
1239 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1240 				 sockptr_t optval, int optlen)
1241 {
1242 	struct tcp_md5sig cmd;
1243 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1244 	const union tcp_md5_addr *addr;
1245 	u8 prefixlen = 32;
1246 	int l3index = 0;
1247 
1248 	if (optlen < sizeof(cmd))
1249 		return -EINVAL;
1250 
1251 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1252 		return -EFAULT;
1253 
1254 	if (sin->sin_family != AF_INET)
1255 		return -EINVAL;
1256 
1257 	if (optname == TCP_MD5SIG_EXT &&
1258 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1259 		prefixlen = cmd.tcpm_prefixlen;
1260 		if (prefixlen > 32)
1261 			return -EINVAL;
1262 	}
1263 
1264 	if (optname == TCP_MD5SIG_EXT &&
1265 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1266 		struct net_device *dev;
1267 
1268 		rcu_read_lock();
1269 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1270 		if (dev && netif_is_l3_master(dev))
1271 			l3index = dev->ifindex;
1272 
1273 		rcu_read_unlock();
1274 
1275 		/* ok to reference set/not set outside of rcu;
1276 		 * right now device MUST be an L3 master
1277 		 */
1278 		if (!dev || !l3index)
1279 			return -EINVAL;
1280 	}
1281 
1282 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1283 
1284 	if (!cmd.tcpm_keylen)
1285 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1286 
1287 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1288 		return -EINVAL;
1289 
1290 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1291 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1292 }
1293 
1294 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1295 				   __be32 daddr, __be32 saddr,
1296 				   const struct tcphdr *th, int nbytes)
1297 {
1298 	struct tcp4_pseudohdr *bp;
1299 	struct scatterlist sg;
1300 	struct tcphdr *_th;
1301 
1302 	bp = hp->scratch;
1303 	bp->saddr = saddr;
1304 	bp->daddr = daddr;
1305 	bp->pad = 0;
1306 	bp->protocol = IPPROTO_TCP;
1307 	bp->len = cpu_to_be16(nbytes);
1308 
1309 	_th = (struct tcphdr *)(bp + 1);
1310 	memcpy(_th, th, sizeof(*th));
1311 	_th->check = 0;
1312 
1313 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1314 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1315 				sizeof(*bp) + sizeof(*th));
1316 	return crypto_ahash_update(hp->md5_req);
1317 }
1318 
1319 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1320 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1321 {
1322 	struct tcp_md5sig_pool *hp;
1323 	struct ahash_request *req;
1324 
1325 	hp = tcp_get_md5sig_pool();
1326 	if (!hp)
1327 		goto clear_hash_noput;
1328 	req = hp->md5_req;
1329 
1330 	if (crypto_ahash_init(req))
1331 		goto clear_hash;
1332 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1333 		goto clear_hash;
1334 	if (tcp_md5_hash_key(hp, key))
1335 		goto clear_hash;
1336 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1337 	if (crypto_ahash_final(req))
1338 		goto clear_hash;
1339 
1340 	tcp_put_md5sig_pool();
1341 	return 0;
1342 
1343 clear_hash:
1344 	tcp_put_md5sig_pool();
1345 clear_hash_noput:
1346 	memset(md5_hash, 0, 16);
1347 	return 1;
1348 }
1349 
1350 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1351 			const struct sock *sk,
1352 			const struct sk_buff *skb)
1353 {
1354 	struct tcp_md5sig_pool *hp;
1355 	struct ahash_request *req;
1356 	const struct tcphdr *th = tcp_hdr(skb);
1357 	__be32 saddr, daddr;
1358 
1359 	if (sk) { /* valid for establish/request sockets */
1360 		saddr = sk->sk_rcv_saddr;
1361 		daddr = sk->sk_daddr;
1362 	} else {
1363 		const struct iphdr *iph = ip_hdr(skb);
1364 		saddr = iph->saddr;
1365 		daddr = iph->daddr;
1366 	}
1367 
1368 	hp = tcp_get_md5sig_pool();
1369 	if (!hp)
1370 		goto clear_hash_noput;
1371 	req = hp->md5_req;
1372 
1373 	if (crypto_ahash_init(req))
1374 		goto clear_hash;
1375 
1376 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1377 		goto clear_hash;
1378 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1379 		goto clear_hash;
1380 	if (tcp_md5_hash_key(hp, key))
1381 		goto clear_hash;
1382 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1383 	if (crypto_ahash_final(req))
1384 		goto clear_hash;
1385 
1386 	tcp_put_md5sig_pool();
1387 	return 0;
1388 
1389 clear_hash:
1390 	tcp_put_md5sig_pool();
1391 clear_hash_noput:
1392 	memset(md5_hash, 0, 16);
1393 	return 1;
1394 }
1395 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1396 
1397 #endif
1398 
1399 /* Called with rcu_read_lock() */
1400 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1401 				    const struct sk_buff *skb,
1402 				    int dif, int sdif)
1403 {
1404 #ifdef CONFIG_TCP_MD5SIG
1405 	/*
1406 	 * This gets called for each TCP segment that arrives
1407 	 * so we want to be efficient.
1408 	 * We have 3 drop cases:
1409 	 * o No MD5 hash and one expected.
1410 	 * o MD5 hash and we're not expecting one.
1411 	 * o MD5 hash and it's wrong.
1412 	 */
1413 	const __u8 *hash_location = NULL;
1414 	struct tcp_md5sig_key *hash_expected;
1415 	const struct iphdr *iph = ip_hdr(skb);
1416 	const struct tcphdr *th = tcp_hdr(skb);
1417 	const union tcp_md5_addr *addr;
1418 	unsigned char newhash[16];
1419 	int genhash, l3index;
1420 
1421 	/* sdif set, means packet ingressed via a device
1422 	 * in an L3 domain and dif is set to the l3mdev
1423 	 */
1424 	l3index = sdif ? dif : 0;
1425 
1426 	addr = (union tcp_md5_addr *)&iph->saddr;
1427 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1428 	hash_location = tcp_parse_md5sig_option(th);
1429 
1430 	/* We've parsed the options - do we have a hash? */
1431 	if (!hash_expected && !hash_location)
1432 		return false;
1433 
1434 	if (hash_expected && !hash_location) {
1435 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1436 		return true;
1437 	}
1438 
1439 	if (!hash_expected && hash_location) {
1440 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1441 		return true;
1442 	}
1443 
1444 	/* Okay, so this is hash_expected and hash_location -
1445 	 * so we need to calculate the checksum.
1446 	 */
1447 	genhash = tcp_v4_md5_hash_skb(newhash,
1448 				      hash_expected,
1449 				      NULL, skb);
1450 
1451 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1452 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1453 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1454 				     &iph->saddr, ntohs(th->source),
1455 				     &iph->daddr, ntohs(th->dest),
1456 				     genhash ? " tcp_v4_calc_md5_hash failed"
1457 				     : "", l3index);
1458 		return true;
1459 	}
1460 	return false;
1461 #endif
1462 	return false;
1463 }
1464 
1465 static void tcp_v4_init_req(struct request_sock *req,
1466 			    const struct sock *sk_listener,
1467 			    struct sk_buff *skb)
1468 {
1469 	struct inet_request_sock *ireq = inet_rsk(req);
1470 	struct net *net = sock_net(sk_listener);
1471 
1472 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1473 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1474 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1475 }
1476 
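/* Resolve the route used to answer this connection request. */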
1477 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1478 					  struct flowi *fl,
1479 					  const struct request_sock *req)
1480 {
1481 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1482 }
1483 
1484 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1485 	.family		=	PF_INET,
1486 	.obj_size	=	sizeof(struct tcp_request_sock),
1487 	.rtx_syn_ack	=	tcp_rtx_synack,
1488 	.send_ack	=	tcp_v4_reqsk_send_ack,
1489 	.destructor	=	tcp_v4_reqsk_destructor,
1490 	.send_reset	=	tcp_v4_send_reset,
1491 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1492 };
1493 
1494 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1495 	.mss_clamp	=	TCP_MSS_DEFAULT,
1496 #ifdef CONFIG_TCP_MD5SIG
1497 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1498 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1499 #endif
1500 	.init_req	=	tcp_v4_init_req,
1501 #ifdef CONFIG_SYN_COOKIES
1502 	.cookie_init_seq =	cookie_v4_init_sequence,
1503 #endif
1504 	.route_req	=	tcp_v4_route_req,
1505 	.init_seq	=	tcp_v4_init_seq,
1506 	.init_ts_off	=	tcp_v4_init_ts_off,
1507 	.send_synack	=	tcp_v4_send_synack,
1508 };
1509 
1510 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1511 {
1512 	/* Never answer SYNs sent to broadcast or multicast addresses */
1513 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1514 		goto drop;
1515 
1516 	return tcp_conn_request(&tcp_request_sock_ops,
1517 				&tcp_request_sock_ipv4_ops, sk, skb);
1518 
1519 drop:
1520 	tcp_listendrop(sk);
1521 	return 0;
1522 }
1523 EXPORT_SYMBOL(tcp_v4_conn_request);
1524 
1525 
1526 /*
1527  * The three way handshake has completed - we got a valid synack -
1528  * now create the new socket.
1529  */
1530 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1531 				  struct request_sock *req,
1532 				  struct dst_entry *dst,
1533 				  struct request_sock *req_unhash,
1534 				  bool *own_req)
1535 {
1536 	struct inet_request_sock *ireq;
1537 	bool found_dup_sk = false;
1538 	struct inet_sock *newinet;
1539 	struct tcp_sock *newtp;
1540 	struct sock *newsk;
1541 #ifdef CONFIG_TCP_MD5SIG
1542 	const union tcp_md5_addr *addr;
1543 	struct tcp_md5sig_key *key;
1544 	int l3index;
1545 #endif
1546 	struct ip_options_rcu *inet_opt;
1547 
1548 	if (sk_acceptq_is_full(sk))
1549 		goto exit_overflow;
1550 
1551 	newsk = tcp_create_openreq_child(sk, req, skb);
1552 	if (!newsk)
1553 		goto exit_nonewsk;
1554 
1555 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1556 	inet_sk_rx_dst_set(newsk, skb);
1557 
1558 	newtp		      = tcp_sk(newsk);
1559 	newinet		      = inet_sk(newsk);
1560 	ireq		      = inet_rsk(req);
1561 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1562 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1563 	newsk->sk_bound_dev_if = ireq->ir_iif;
1564 	newinet->inet_saddr   = ireq->ir_loc_addr;
1565 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1566 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1567 	newinet->mc_index     = inet_iif(skb);
1568 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1569 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1570 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1571 	if (inet_opt)
1572 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1573 	newinet->inet_id = prandom_u32();
1574 
1575 	/* Set ToS of the new socket based upon the value of incoming SYN.
1576 	 * ECT bits are set later in tcp_init_transfer().
1577 	 */
1578 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1579 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1580 
1581 	if (!dst) {
1582 		dst = inet_csk_route_child_sock(sk, newsk, req);
1583 		if (!dst)
1584 			goto put_and_exit;
1585 	} else {
1586 		/* syncookie case : see end of cookie_v4_check() */
1587 	}
1588 	sk_setup_caps(newsk, dst);
1589 
1590 	tcp_ca_openreq_child(newsk, dst);
1591 
1592 	tcp_sync_mss(newsk, dst_mtu(dst));
1593 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1594 
1595 	tcp_initialize_rcv_mss(newsk);
1596 
1597 #ifdef CONFIG_TCP_MD5SIG
1598 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1599 	/* Copy over the MD5 key from the original socket */
1600 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1601 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1602 	if (key) {
1603 		/*
1604 		 * We're using one, so create a matching key
1605 		 * on the newsk structure. If we fail to get
1606 		 * memory, then we end up not copying the key
1607 		 * across. Shucks.
1608 		 */
1609 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1610 			       key->key, key->keylen, GFP_ATOMIC);
1611 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1612 	}
1613 #endif
1614 
1615 	if (__inet_inherit_port(sk, newsk) < 0)
1616 		goto put_and_exit;
1617 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1618 				       &found_dup_sk);
1619 	if (likely(*own_req)) {
1620 		tcp_move_syn(newtp, req);
1621 		ireq->ireq_opt = NULL;
1622 	} else {
1623 		newinet->inet_opt = NULL;
1624 
1625 		if (!req_unhash && found_dup_sk) {
1626 			/* This code path should only be executed in the
1627 			 * syncookie case
1628 			 */
1629 			bh_unlock_sock(newsk);
1630 			sock_put(newsk);
1631 			newsk = NULL;
1632 		}
1633 	}
1634 	return newsk;
1635 
1636 exit_overflow:
1637 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1638 exit_nonewsk:
1639 	dst_release(dst);
1640 exit:
1641 	tcp_listendrop(sk);
1642 	return NULL;
1643 put_and_exit:
1644 	newinet->inet_opt = NULL;
1645 	inet_csk_prepare_forced_close(newsk);
1646 	tcp_done(newsk);
1647 	goto exit;
1648 }
1649 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1650 
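/* On a listener, a non-SYN segment may carry a valid syncookie; if so,
 * cookie_v4_check() recreates the child socket for it.
 */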
1651 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1652 {
1653 #ifdef CONFIG_SYN_COOKIES
1654 	const struct tcphdr *th = tcp_hdr(skb);
1655 
1656 	if (!th->syn)
1657 		sk = cookie_v4_check(sk, skb);
1658 #endif
1659 	return sk;
1660 }
1661 
1662 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1663 			 struct tcphdr *th, u32 *cookie)
1664 {
1665 	u16 mss = 0;
1666 #ifdef CONFIG_SYN_COOKIES
1667 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1668 				    &tcp_request_sock_ipv4_ops, sk, th);
1669 	if (mss) {
1670 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1671 		tcp_synq_overflow(sk);
1672 	}
1673 #endif
1674 	return mss;
1675 }
1676 
1677 /* The socket must have its spinlock held when we get
1678  * here, unless it is a TCP_LISTEN socket.
1679  *
1680  * We have a potential double-lock case here, so even when
1681  * doing backlog processing we use the BH locking scheme.
1682  * This is because we cannot sleep with the original spinlock
1683  * held.
1684  */
1685 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1686 {
1687 	struct sock *rsk;
1688 
1689 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1690 		struct dst_entry *dst;
1691 
1692 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1693 						lockdep_sock_is_held(sk));
1694 
1695 		sock_rps_save_rxhash(sk, skb);
1696 		sk_mark_napi_id(sk, skb);
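		/* Revalidate the cached input route: if the segment came in
		 * on a different interface or the dst entry is stale, drop it
		 * so a fresh route can be cached later.
		 */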
1697 		if (dst) {
1698 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1699 			    !dst->ops->check(dst, 0)) {
1700 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1701 				dst_release(dst);
1702 			}
1703 		}
1704 		tcp_rcv_established(sk, skb);
1705 		return 0;
1706 	}
1707 
1708 	if (tcp_checksum_complete(skb))
1709 		goto csum_err;
1710 
1711 	if (sk->sk_state == TCP_LISTEN) {
1712 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1713 
1714 		if (!nsk)
1715 			goto discard;
1716 		if (nsk != sk) {
1717 			if (tcp_child_process(sk, nsk, skb)) {
1718 				rsk = nsk;
1719 				goto reset;
1720 			}
1721 			return 0;
1722 		}
1723 	} else
1724 		sock_rps_save_rxhash(sk, skb);
1725 
1726 	if (tcp_rcv_state_process(sk, skb)) {
1727 		rsk = sk;
1728 		goto reset;
1729 	}
1730 	return 0;
1731 
1732 reset:
1733 	tcp_v4_send_reset(rsk, skb);
1734 discard:
1735 	kfree_skb(skb);
1736 	/* Be careful here. If this function gets more complicated and
1737 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1738 	 * might be destroyed here. This current version compiles correctly,
1739 	 * but you have been warned.
1740 	 */
1741 	return 0;
1742 
1743 csum_err:
1744 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1745 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1746 	goto discard;
1747 }
1748 EXPORT_SYMBOL(tcp_v4_do_rcv);
1749 
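/* Early demux: look the segment up in the established hash straight from the
 * IP receive path and, when a full socket is found, attach its cached rx dst
 * to the skb so the regular routing lookup can be skipped.
 */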
1750 int tcp_v4_early_demux(struct sk_buff *skb)
1751 {
1752 	const struct iphdr *iph;
1753 	const struct tcphdr *th;
1754 	struct sock *sk;
1755 
1756 	if (skb->pkt_type != PACKET_HOST)
1757 		return 0;
1758 
1759 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1760 		return 0;
1761 
1762 	iph = ip_hdr(skb);
1763 	th = tcp_hdr(skb);
1764 
1765 	if (th->doff < sizeof(struct tcphdr) / 4)
1766 		return 0;
1767 
1768 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1769 				       iph->saddr, th->source,
1770 				       iph->daddr, ntohs(th->dest),
1771 				       skb->skb_iif, inet_sdif(skb));
1772 	if (sk) {
1773 		skb->sk = sk;
1774 		skb->destructor = sock_edemux;
1775 		if (sk_fullsock(sk)) {
1776 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1777 
1778 			if (dst)
1779 				dst = dst_check(dst, 0);
1780 			if (dst &&
1781 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1782 				skb_dst_set_noref(skb, dst);
1783 		}
1784 	}
1785 	return 0;
1786 }
1787 
1788 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1789 {
1790 	u32 limit, tail_gso_size, tail_gso_segs;
1791 	struct skb_shared_info *shinfo;
1792 	const struct tcphdr *th;
1793 	struct tcphdr *thtail;
1794 	struct sk_buff *tail;
1795 	unsigned int hdrlen;
1796 	bool fragstolen;
1797 	u32 gso_segs;
1798 	u32 gso_size;
1799 	int delta;
1800 
1801 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1802 	 * we can fix skb->truesize to its real value to avoid future drops.
1803 	 * This is valid because skb is not yet charged to the socket.
1804 	 * It has been noticed that pure SACK packets were sometimes dropped
1805 	 * (if cooked by drivers without the copybreak feature).
1806 	 */
1807 	skb_condense(skb);
1808 
1809 	skb_dst_drop(skb);
1810 
1811 	if (unlikely(tcp_checksum_complete(skb))) {
1812 		bh_unlock_sock(sk);
1813 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1814 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1815 		return true;
1816 	}
1817 
1818 	/* Attempt coalescing to last skb in backlog, even if we are
1819 	 * above the limits.
1820 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1821 	 */
1822 	th = (const struct tcphdr *)skb->data;
1823 	hdrlen = th->doff * 4;
1824 
1825 	tail = sk->sk_backlog.tail;
1826 	if (!tail)
1827 		goto no_coalesce;
1828 	thtail = (struct tcphdr *)tail->data;
1829 
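	/* Coalescing is only attempted for strictly contiguous in-order data
	 * with matching DSCP/ECN bits, identical TCP option blocks, ACK set
	 * on both segments and no SYN/RST/URG; anything else becomes its own
	 * backlog entry.
	 */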
1830 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1831 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1832 	    ((TCP_SKB_CB(tail)->tcp_flags |
1833 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1834 	    !((TCP_SKB_CB(tail)->tcp_flags &
1835 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1836 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1837 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1838 #ifdef CONFIG_TLS_DEVICE
1839 	    tail->decrypted != skb->decrypted ||
1840 #endif
1841 	    !mptcp_skb_can_collapse(tail, skb) ||
1842 	    thtail->doff != th->doff ||
1843 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1844 		goto no_coalesce;
1845 
1846 	__skb_pull(skb, hdrlen);
1847 
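	/* Record the GSO geometry of both skbs before merging so that
	 * gso_size/gso_segs on the coalesced skb stay meaningful.
	 */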
1848 	shinfo = skb_shinfo(skb);
1849 	gso_size = shinfo->gso_size ?: skb->len;
1850 	gso_segs = shinfo->gso_segs ?: 1;
1851 
1852 	shinfo = skb_shinfo(tail);
1853 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1854 	tail_gso_segs = shinfo->gso_segs ?: 1;
1855 
1856 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1857 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1858 
1859 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1860 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1861 			thtail->window = th->window;
1862 		}
1863 
1864 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1865 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1866 		 * is not entered if we append a packet with a FIN.
1867 		 * SYN, RST, URG are not present.
1868 		 * ACK is set on both packets.
1869 		 * PSH : we do not really care in TCP stack,
1870 		 *       at least for 'GRO' packets.
1871 		 */
1872 		thtail->fin |= th->fin;
1873 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1874 
1875 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1876 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1877 			tail->tstamp = skb->tstamp;
1878 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1879 		}
1880 
1881 		/* Not as strict as GRO. We only need to carry mss max value */
1882 		shinfo->gso_size = max(gso_size, tail_gso_size);
1883 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1884 
1885 		sk->sk_backlog.len += delta;
1886 		__NET_INC_STATS(sock_net(sk),
1887 				LINUX_MIB_TCPBACKLOGCOALESCE);
1888 		kfree_skb_partial(skb, fragstolen);
1889 		return false;
1890 	}
1891 	__skb_push(skb, hdrlen);
1892 
1893 no_coalesce:
1894 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1895 
1896 	/* Only socket owner can try to collapse/prune rx queues
1897 	 * to reduce memory overhead, so add a little headroom here.
1898 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1899 	 */
1900 	limit += 64 * 1024;
1901 
1902 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1903 		bh_unlock_sock(sk);
1904 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1905 		return true;
1906 	}
1907 	return false;
1908 }
1909 EXPORT_SYMBOL(tcp_add_backlog);
1910 
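/* Run the socket filter (if one is attached); sk_filter_trim_cap() ensures
 * at least the TCP header survives any trimming the filter requests.
 */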
1911 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1912 {
1913 	struct tcphdr *th = (struct tcphdr *)skb->data;
1914 
1915 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1916 }
1917 EXPORT_SYMBOL(tcp_filter);
1918 
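/* TCP reuses the skb control block: tcp_v4_fill_cb() saves the IP control
 * block into TCP_SKB_CB()->header.h4 before overwriting it, and
 * tcp_v4_restore_cb() copies it back whenever the skb must be handed to code
 * that still expects a valid IPCB() (e.g. before retrying a socket lookup).
 */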
1919 static void tcp_v4_restore_cb(struct sk_buff *skb)
1920 {
1921 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1922 		sizeof(struct inet_skb_parm));
1923 }
1924 
1925 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1926 			   const struct tcphdr *th)
1927 {
1928 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1929 	 * barrier() makes sure the compiler won't play aliasing games.
1930 	 */
1931 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1932 		sizeof(struct inet_skb_parm));
1933 	barrier();
1934 
1935 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1936 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1937 				    skb->len - th->doff * 4);
1938 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1939 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1940 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1941 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1942 	TCP_SKB_CB(skb)->sacked	 = 0;
1943 	TCP_SKB_CB(skb)->has_rxtstamp =
1944 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1945 }
1946 
1947 /*
1948  *	From tcp_input.c
1949  */
1950 
1951 int tcp_v4_rcv(struct sk_buff *skb)
1952 {
1953 	struct net *net = dev_net(skb->dev);
1954 	struct sk_buff *skb_to_free;
1955 	int sdif = inet_sdif(skb);
1956 	int dif = inet_iif(skb);
1957 	const struct iphdr *iph;
1958 	const struct tcphdr *th;
1959 	bool refcounted;
1960 	struct sock *sk;
1961 	int ret;
1962 
1963 	if (skb->pkt_type != PACKET_HOST)
1964 		goto discard_it;
1965 
1966 	/* Count it even if it's bad */
1967 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1968 
1969 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1970 		goto discard_it;
1971 
1972 	th = (const struct tcphdr *)skb->data;
1973 
1974 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1975 		goto bad_packet;
1976 	if (!pskb_may_pull(skb, th->doff * 4))
1977 		goto discard_it;
1978 
1979 	/* An explanation is required here, I think.
1980 	 * Packet length and doff are validated by header prediction,
1981 	 * provided the th->doff == 0 case is eliminated.
1982 	 * So, we defer the checks. */
1983 
1984 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1985 		goto csum_error;
1986 
1987 	th = (const struct tcphdr *)skb->data;
1988 	iph = ip_hdr(skb);
1989 lookup:
1990 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1991 			       th->dest, sdif, &refcounted);
1992 	if (!sk)
1993 		goto no_tcp_socket;
1994 
1995 process:
1996 	if (sk->sk_state == TCP_TIME_WAIT)
1997 		goto do_time_wait;
1998 
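	/* A request socket (mini socket for a pending connection) matched:
	 * perform the policy/MD5/checksum checks under its listener and let
	 * tcp_check_req() either create the full child socket or drop.
	 */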
1999 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2000 		struct request_sock *req = inet_reqsk(sk);
2001 		bool req_stolen = false;
2002 		struct sock *nsk;
2003 
2004 		sk = req->rsk_listener;
2005 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2006 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2007 			sk_drops_add(sk, skb);
2008 			reqsk_put(req);
2009 			goto discard_it;
2010 		}
2011 		if (tcp_checksum_complete(skb)) {
2012 			reqsk_put(req);
2013 			goto csum_error;
2014 		}
2015 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2016 			inet_csk_reqsk_queue_drop_and_put(sk, req);
2017 			goto lookup;
2018 		}
2019 		/* We own a reference on the listener, increase it again
2020 		 * as we might lose it too soon.
2021 		 */
2022 		sock_hold(sk);
2023 		refcounted = true;
2024 		nsk = NULL;
2025 		if (!tcp_filter(sk, skb)) {
2026 			th = (const struct tcphdr *)skb->data;
2027 			iph = ip_hdr(skb);
2028 			tcp_v4_fill_cb(skb, iph, th);
2029 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2030 		}
2031 		if (!nsk) {
2032 			reqsk_put(req);
2033 			if (req_stolen) {
2034 				/* Another cpu got exclusive access to req
2035 				 * and created a full blown socket.
2036 				 * Try to feed this packet to this socket
2037 				 * instead of discarding it.
2038 				 */
2039 				tcp_v4_restore_cb(skb);
2040 				sock_put(sk);
2041 				goto lookup;
2042 			}
2043 			goto discard_and_relse;
2044 		}
2045 		nf_reset_ct(skb);
2046 		if (nsk == sk) {
2047 			reqsk_put(req);
2048 			tcp_v4_restore_cb(skb);
2049 		} else if (tcp_child_process(sk, nsk, skb)) {
2050 			tcp_v4_send_reset(nsk, skb);
2051 			goto discard_and_relse;
2052 		} else {
2053 			sock_put(sk);
2054 			return 0;
2055 		}
2056 	}
2057 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2058 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2059 		goto discard_and_relse;
2060 	}
2061 
2062 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2063 		goto discard_and_relse;
2064 
2065 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2066 		goto discard_and_relse;
2067 
2068 	nf_reset_ct(skb);
2069 
2070 	if (tcp_filter(sk, skb))
2071 		goto discard_and_relse;
2072 	th = (const struct tcphdr *)skb->data;
2073 	iph = ip_hdr(skb);
2074 	tcp_v4_fill_cb(skb, iph, th);
2075 
2076 	skb->dev = NULL;
2077 
2078 	if (sk->sk_state == TCP_LISTEN) {
2079 		ret = tcp_v4_do_rcv(sk, skb);
2080 		goto put_and_return;
2081 	}
2082 
2083 	sk_incoming_cpu_update(sk);
2084 
2085 	bh_lock_sock_nested(sk);
2086 	tcp_segs_in(tcp_sk(sk), skb);
2087 	ret = 0;
2088 	if (!sock_owned_by_user(sk)) {
2089 		skb_to_free = sk->sk_rx_skb_cache;
2090 		sk->sk_rx_skb_cache = NULL;
2091 		ret = tcp_v4_do_rcv(sk, skb);
2092 	} else {
2093 		if (tcp_add_backlog(sk, skb))
2094 			goto discard_and_relse;
2095 		skb_to_free = NULL;
2096 	}
2097 	bh_unlock_sock(sk);
2098 	if (skb_to_free)
2099 		__kfree_skb(skb_to_free);
2100 
2101 put_and_return:
2102 	if (refcounted)
2103 		sock_put(sk);
2104 
2105 	return ret;
2106 
2107 no_tcp_socket:
2108 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2109 		goto discard_it;
2110 
2111 	tcp_v4_fill_cb(skb, iph, th);
2112 
2113 	if (tcp_checksum_complete(skb)) {
2114 csum_error:
2115 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2116 bad_packet:
2117 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2118 	} else {
2119 		tcp_v4_send_reset(NULL, skb);
2120 	}
2121 
2122 discard_it:
2123 	/* Discard frame. */
2124 	kfree_skb(skb);
2125 	return 0;
2126 
2127 discard_and_relse:
2128 	sk_drops_add(sk, skb);
2129 	if (refcounted)
2130 		sock_put(sk);
2131 	goto discard_it;
2132 
2133 do_time_wait:
2134 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2135 		inet_twsk_put(inet_twsk(sk));
2136 		goto discard_it;
2137 	}
2138 
2139 	tcp_v4_fill_cb(skb, iph, th);
2140 
2141 	if (tcp_checksum_complete(skb)) {
2142 		inet_twsk_put(inet_twsk(sk));
2143 		goto csum_error;
2144 	}
2145 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2146 	case TCP_TW_SYN: {
2147 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2148 							&tcp_hashinfo, skb,
2149 							__tcp_hdrlen(th),
2150 							iph->saddr, th->source,
2151 							iph->daddr, th->dest,
2152 							inet_iif(skb),
2153 							sdif);
2154 		if (sk2) {
2155 			inet_twsk_deschedule_put(inet_twsk(sk));
2156 			sk = sk2;
2157 			tcp_v4_restore_cb(skb);
2158 			refcounted = false;
2159 			goto process;
2160 		}
2161 	}
2162 		/* to ACK */
2163 		fallthrough;
2164 	case TCP_TW_ACK:
2165 		tcp_v4_timewait_ack(sk, skb);
2166 		break;
2167 	case TCP_TW_RST:
2168 		tcp_v4_send_reset(sk, skb);
2169 		inet_twsk_deschedule_put(inet_twsk(sk));
2170 		goto discard_it;
2171 	case TCP_TW_SUCCESS:;
2172 	}
2173 	goto discard_it;
2174 }
2175 
2176 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2177 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2178 	.twsk_unique	= tcp_twsk_unique,
2179 	.twsk_destructor= tcp_twsk_destructor,
2180 };
2181 
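/* Cache the input route on the socket so later segments in established state
 * (and early demux) can reuse it without another routing lookup.
 */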
2182 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2183 {
2184 	struct dst_entry *dst = skb_dst(skb);
2185 
2186 	if (dst && dst_hold_safe(dst)) {
2187 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2188 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2189 	}
2190 }
2191 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2192 
2193 const struct inet_connection_sock_af_ops ipv4_specific = {
2194 	.queue_xmit	   = ip_queue_xmit,
2195 	.send_check	   = tcp_v4_send_check,
2196 	.rebuild_header	   = inet_sk_rebuild_header,
2197 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2198 	.conn_request	   = tcp_v4_conn_request,
2199 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2200 	.net_header_len	   = sizeof(struct iphdr),
2201 	.setsockopt	   = ip_setsockopt,
2202 	.getsockopt	   = ip_getsockopt,
2203 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2204 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2205 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2206 };
2207 EXPORT_SYMBOL(ipv4_specific);
2208 
2209 #ifdef CONFIG_TCP_MD5SIG
2210 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2211 	.md5_lookup		= tcp_v4_md5_lookup,
2212 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2213 	.md5_parse		= tcp_v4_parse_md5_keys,
2214 };
2215 #endif
2216 
2217 /* NOTE: Many fields are already zeroed by the call to
2218  *       sk_alloc(), so they need not be initialized here.
2219  */
2220 static int tcp_v4_init_sock(struct sock *sk)
2221 {
2222 	struct inet_connection_sock *icsk = inet_csk(sk);
2223 
2224 	tcp_init_sock(sk);
2225 
2226 	icsk->icsk_af_ops = &ipv4_specific;
2227 
2228 #ifdef CONFIG_TCP_MD5SIG
2229 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2230 #endif
2231 
2232 	return 0;
2233 }
2234 
2235 void tcp_v4_destroy_sock(struct sock *sk)
2236 {
2237 	struct tcp_sock *tp = tcp_sk(sk);
2238 
2239 	trace_tcp_destroy_sock(sk);
2240 
2241 	tcp_clear_xmit_timers(sk);
2242 
2243 	tcp_cleanup_congestion_control(sk);
2244 
2245 	tcp_cleanup_ulp(sk);
2246 
2247 	/* Clean up the write buffer. */
2248 	tcp_write_queue_purge(sk);
2249 
2250 	/* Check if we want to disable active TFO */
2251 	tcp_fastopen_active_disable_ofo_check(sk);
2252 
2253 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2254 	skb_rbtree_purge(&tp->out_of_order_queue);
2255 
2256 #ifdef CONFIG_TCP_MD5SIG
2257 	/* Clean up the MD5 key list, if any */
2258 	if (tp->md5sig_info) {
2259 		tcp_clear_md5_list(sk);
2260 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2261 		tp->md5sig_info = NULL;
2262 	}
2263 #endif
2264 
2265 	/* Clean up a referenced TCP bind bucket. */
2266 	if (inet_csk(sk)->icsk_bind_hash)
2267 		inet_put_port(sk);
2268 
2269 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2270 
2271 	/* If socket is aborted during connect operation */
2272 	tcp_free_fastopen_req(tp);
2273 	tcp_fastopen_destroy_cipher(sk);
2274 	tcp_saved_syn_free(tp);
2275 
2276 	sk_sockets_allocated_dec(sk);
2277 }
2278 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2279 
2280 #ifdef CONFIG_PROC_FS
2281 /* Proc filesystem TCP sock list dumping. */
2282 
2283 /*
2284  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2285  * starting from bucket given in st->bucket; when st->bucket is zero the
2286  * very first socket in the hash table is returned.
2287  */
2288 static void *listening_get_next(struct seq_file *seq, void *cur)
2289 {
2290 	struct tcp_seq_afinfo *afinfo;
2291 	struct tcp_iter_state *st = seq->private;
2292 	struct net *net = seq_file_net(seq);
2293 	struct inet_listen_hashbucket *ilb;
2294 	struct hlist_nulls_node *node;
2295 	struct sock *sk = cur;
2296 
2297 	if (st->bpf_seq_afinfo)
2298 		afinfo = st->bpf_seq_afinfo;
2299 	else
2300 		afinfo = PDE_DATA(file_inode(seq->file));
2301 
2302 	if (!sk) {
2303 get_head:
2304 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2305 		spin_lock(&ilb->lock);
2306 		sk = sk_nulls_head(&ilb->nulls_head);
2307 		st->offset = 0;
2308 		goto get_sk;
2309 	}
2310 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311 	++st->num;
2312 	++st->offset;
2313 
2314 	sk = sk_nulls_next(sk);
2315 get_sk:
2316 	sk_nulls_for_each_from(sk, node) {
2317 		if (!net_eq(sock_net(sk), net))
2318 			continue;
2319 		if (afinfo->family == AF_UNSPEC ||
2320 		    sk->sk_family == afinfo->family)
2321 			return sk;
2322 	}
2323 	spin_unlock(&ilb->lock);
2324 	st->offset = 0;
2325 	if (++st->bucket < INET_LHTABLE_SIZE)
2326 		goto get_head;
2327 	return NULL;
2328 }
2329 
2330 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2331 {
2332 	struct tcp_iter_state *st = seq->private;
2333 	void *rc;
2334 
2335 	st->bucket = 0;
2336 	st->offset = 0;
2337 	rc = listening_get_next(seq, NULL);
2338 
2339 	while (rc && *pos) {
2340 		rc = listening_get_next(seq, rc);
2341 		--*pos;
2342 	}
2343 	return rc;
2344 }
2345 
2346 static inline bool empty_bucket(const struct tcp_iter_state *st)
2347 {
2348 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2349 }
2350 
2351 /*
2352  * Get first established socket starting from bucket given in st->bucket.
2353  * If st->bucket is zero, the very first socket in the hash is returned.
2354  */
2355 static void *established_get_first(struct seq_file *seq)
2356 {
2357 	struct tcp_seq_afinfo *afinfo;
2358 	struct tcp_iter_state *st = seq->private;
2359 	struct net *net = seq_file_net(seq);
2360 	void *rc = NULL;
2361 
2362 	if (st->bpf_seq_afinfo)
2363 		afinfo = st->bpf_seq_afinfo;
2364 	else
2365 		afinfo = PDE_DATA(file_inode(seq->file));
2366 
2367 	st->offset = 0;
2368 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 		struct sock *sk;
2370 		struct hlist_nulls_node *node;
2371 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372 
2373 		/* Lockless fast path for the common case of empty buckets */
2374 		if (empty_bucket(st))
2375 			continue;
2376 
2377 		spin_lock_bh(lock);
2378 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 			if ((afinfo->family != AF_UNSPEC &&
2380 			     sk->sk_family != afinfo->family) ||
2381 			    !net_eq(sock_net(sk), net)) {
2382 				continue;
2383 			}
2384 			rc = sk;
2385 			goto out;
2386 		}
2387 		spin_unlock_bh(lock);
2388 	}
2389 out:
2390 	return rc;
2391 }
2392 
2393 static void *established_get_next(struct seq_file *seq, void *cur)
2394 {
2395 	struct tcp_seq_afinfo *afinfo;
2396 	struct sock *sk = cur;
2397 	struct hlist_nulls_node *node;
2398 	struct tcp_iter_state *st = seq->private;
2399 	struct net *net = seq_file_net(seq);
2400 
2401 	if (st->bpf_seq_afinfo)
2402 		afinfo = st->bpf_seq_afinfo;
2403 	else
2404 		afinfo = PDE_DATA(file_inode(seq->file));
2405 
2406 	++st->num;
2407 	++st->offset;
2408 
2409 	sk = sk_nulls_next(sk);
2410 
2411 	sk_nulls_for_each_from(sk, node) {
2412 		if ((afinfo->family == AF_UNSPEC ||
2413 		     sk->sk_family == afinfo->family) &&
2414 		    net_eq(sock_net(sk), net))
2415 			return sk;
2416 	}
2417 
2418 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2419 	++st->bucket;
2420 	return established_get_first(seq);
2421 }
2422 
2423 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2424 {
2425 	struct tcp_iter_state *st = seq->private;
2426 	void *rc;
2427 
2428 	st->bucket = 0;
2429 	rc = established_get_first(seq);
2430 
2431 	while (rc && pos) {
2432 		rc = established_get_next(seq, rc);
2433 		--pos;
2434 	}
2435 	return rc;
2436 }
2437 
2438 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2439 {
2440 	void *rc;
2441 	struct tcp_iter_state *st = seq->private;
2442 
2443 	st->state = TCP_SEQ_STATE_LISTENING;
2444 	rc	  = listening_get_idx(seq, &pos);
2445 
2446 	if (!rc) {
2447 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2448 		rc	  = established_get_idx(seq, pos);
2449 	}
2450 
2451 	return rc;
2452 }
2453 
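/* Resume an interrupted /proc dump at the bucket/offset recorded in the
 * iterator state instead of re-walking from the start, keeping reads of
 * large tables reasonably cheap.
 */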
2454 static void *tcp_seek_last_pos(struct seq_file *seq)
2455 {
2456 	struct tcp_iter_state *st = seq->private;
2457 	int bucket = st->bucket;
2458 	int offset = st->offset;
2459 	int orig_num = st->num;
2460 	void *rc = NULL;
2461 
2462 	switch (st->state) {
2463 	case TCP_SEQ_STATE_LISTENING:
2464 		if (st->bucket >= INET_LHTABLE_SIZE)
2465 			break;
2466 		st->state = TCP_SEQ_STATE_LISTENING;
2467 		rc = listening_get_next(seq, NULL);
2468 		while (offset-- && rc && bucket == st->bucket)
2469 			rc = listening_get_next(seq, rc);
2470 		if (rc)
2471 			break;
2472 		st->bucket = 0;
2473 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2474 		fallthrough;
2475 	case TCP_SEQ_STATE_ESTABLISHED:
2476 		if (st->bucket > tcp_hashinfo.ehash_mask)
2477 			break;
2478 		rc = established_get_first(seq);
2479 		while (offset-- && rc && bucket == st->bucket)
2480 			rc = established_get_next(seq, rc);
2481 	}
2482 
2483 	st->num = orig_num;
2484 
2485 	return rc;
2486 }
2487 
2488 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2489 {
2490 	struct tcp_iter_state *st = seq->private;
2491 	void *rc;
2492 
2493 	if (*pos && *pos == st->last_pos) {
2494 		rc = tcp_seek_last_pos(seq);
2495 		if (rc)
2496 			goto out;
2497 	}
2498 
2499 	st->state = TCP_SEQ_STATE_LISTENING;
2500 	st->num = 0;
2501 	st->bucket = 0;
2502 	st->offset = 0;
2503 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2504 
2505 out:
2506 	st->last_pos = *pos;
2507 	return rc;
2508 }
2509 EXPORT_SYMBOL(tcp_seq_start);
2510 
2511 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2512 {
2513 	struct tcp_iter_state *st = seq->private;
2514 	void *rc = NULL;
2515 
2516 	if (v == SEQ_START_TOKEN) {
2517 		rc = tcp_get_idx(seq, 0);
2518 		goto out;
2519 	}
2520 
2521 	switch (st->state) {
2522 	case TCP_SEQ_STATE_LISTENING:
2523 		rc = listening_get_next(seq, v);
2524 		if (!rc) {
2525 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2526 			st->bucket = 0;
2527 			st->offset = 0;
2528 			rc	  = established_get_first(seq);
2529 		}
2530 		break;
2531 	case TCP_SEQ_STATE_ESTABLISHED:
2532 		rc = established_get_next(seq, v);
2533 		break;
2534 	}
2535 out:
2536 	++*pos;
2537 	st->last_pos = *pos;
2538 	return rc;
2539 }
2540 EXPORT_SYMBOL(tcp_seq_next);
2541 
2542 void tcp_seq_stop(struct seq_file *seq, void *v)
2543 {
2544 	struct tcp_iter_state *st = seq->private;
2545 
2546 	switch (st->state) {
2547 	case TCP_SEQ_STATE_LISTENING:
2548 		if (v != SEQ_START_TOKEN)
2549 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2550 		break;
2551 	case TCP_SEQ_STATE_ESTABLISHED:
2552 		if (v)
2553 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2554 		break;
2555 	}
2556 }
2557 EXPORT_SYMBOL(tcp_seq_stop);
2558 
2559 static void get_openreq4(const struct request_sock *req,
2560 			 struct seq_file *f, int i)
2561 {
2562 	const struct inet_request_sock *ireq = inet_rsk(req);
2563 	long delta = req->rsk_timer.expires - jiffies;
2564 
2565 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2566 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2567 		i,
2568 		ireq->ir_loc_addr,
2569 		ireq->ir_num,
2570 		ireq->ir_rmt_addr,
2571 		ntohs(ireq->ir_rmt_port),
2572 		TCP_SYN_RECV,
2573 		0, 0, /* could print option size, but that is af dependent. */
2574 		1,    /* timers active (only the expire timer) */
2575 		jiffies_delta_to_clock_t(delta),
2576 		req->num_timeout,
2577 		from_kuid_munged(seq_user_ns(f),
2578 				 sock_i_uid(req->rsk_listener)),
2579 		0,  /* non standard timer */
2580 		0, /* open_requests have no inode */
2581 		0,
2582 		req);
2583 }
2584 
2585 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2586 {
2587 	int timer_active;
2588 	unsigned long timer_expires;
2589 	const struct tcp_sock *tp = tcp_sk(sk);
2590 	const struct inet_connection_sock *icsk = inet_csk(sk);
2591 	const struct inet_sock *inet = inet_sk(sk);
2592 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2593 	__be32 dest = inet->inet_daddr;
2594 	__be32 src = inet->inet_rcv_saddr;
2595 	__u16 destp = ntohs(inet->inet_dport);
2596 	__u16 srcp = ntohs(inet->inet_sport);
2597 	int rx_queue;
2598 	int state;
2599 
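	/* "tr" field of /proc/net/tcp: 1 = retransmit/loss-probe/REO timer,
	 * 4 = zero window probe timer, 2 = sk_timer (keepalive), 0 = none.
	 */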
2600 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2601 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2602 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2603 		timer_active	= 1;
2604 		timer_expires	= icsk->icsk_timeout;
2605 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2606 		timer_active	= 4;
2607 		timer_expires	= icsk->icsk_timeout;
2608 	} else if (timer_pending(&sk->sk_timer)) {
2609 		timer_active	= 2;
2610 		timer_expires	= sk->sk_timer.expires;
2611 	} else {
2612 		timer_active	= 0;
2613 		timer_expires = jiffies;
2614 	}
2615 
2616 	state = inet_sk_state_load(sk);
2617 	if (state == TCP_LISTEN)
2618 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2619 	else
2620 		/* Because we don't lock the socket,
2621 		 * we might find a transient negative value.
2622 		 */
2623 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2624 				      READ_ONCE(tp->copied_seq), 0);
2625 
2626 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2627 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2628 		i, src, srcp, dest, destp, state,
2629 		READ_ONCE(tp->write_seq) - tp->snd_una,
2630 		rx_queue,
2631 		timer_active,
2632 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2633 		icsk->icsk_retransmits,
2634 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2635 		icsk->icsk_probes_out,
2636 		sock_i_ino(sk),
2637 		refcount_read(&sk->sk_refcnt), sk,
2638 		jiffies_to_clock_t(icsk->icsk_rto),
2639 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2640 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2641 		tp->snd_cwnd,
2642 		state == TCP_LISTEN ?
2643 		    fastopenq->max_qlen :
2644 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2645 }
2646 
2647 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2648 			       struct seq_file *f, int i)
2649 {
2650 	long delta = tw->tw_timer.expires - jiffies;
2651 	__be32 dest, src;
2652 	__u16 destp, srcp;
2653 
2654 	dest  = tw->tw_daddr;
2655 	src   = tw->tw_rcv_saddr;
2656 	destp = ntohs(tw->tw_dport);
2657 	srcp  = ntohs(tw->tw_sport);
2658 
2659 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2660 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2661 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2662 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2663 		refcount_read(&tw->tw_refcnt), tw);
2664 }
2665 
2666 #define TMPSZ 150
2667 
2668 static int tcp4_seq_show(struct seq_file *seq, void *v)
2669 {
2670 	struct tcp_iter_state *st;
2671 	struct sock *sk = v;
2672 
2673 	seq_setwidth(seq, TMPSZ - 1);
2674 	if (v == SEQ_START_TOKEN) {
2675 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2676 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2677 			   "inode");
2678 		goto out;
2679 	}
2680 	st = seq->private;
2681 
2682 	if (sk->sk_state == TCP_TIME_WAIT)
2683 		get_timewait4_sock(v, seq, st->num);
2684 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2685 		get_openreq4(v, seq, st->num);
2686 	else
2687 		get_tcp4_sock(v, seq, st->num);
2688 out:
2689 	seq_pad(seq, '\n');
2690 	return 0;
2691 }
2692 
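/* BPF iterator for TCP sockets: reuses the seq_file walk above and hands
 * each visited socket to a BPF program through the bpf_iter__tcp context.
 */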
2693 #ifdef CONFIG_BPF_SYSCALL
2694 struct bpf_iter__tcp {
2695 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2696 	__bpf_md_ptr(struct sock_common *, sk_common);
2697 	uid_t uid __aligned(8);
2698 };
2699 
2700 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2701 			     struct sock_common *sk_common, uid_t uid)
2702 {
2703 	struct bpf_iter__tcp ctx;
2704 
2705 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2706 	ctx.meta = meta;
2707 	ctx.sk_common = sk_common;
2708 	ctx.uid = uid;
2709 	return bpf_iter_run_prog(prog, &ctx);
2710 }
2711 
2712 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2713 {
2714 	struct bpf_iter_meta meta;
2715 	struct bpf_prog *prog;
2716 	struct sock *sk = v;
2717 	uid_t uid;
2718 
2719 	if (v == SEQ_START_TOKEN)
2720 		return 0;
2721 
2722 	if (sk->sk_state == TCP_TIME_WAIT) {
2723 		uid = 0;
2724 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2725 		const struct request_sock *req = v;
2726 
2727 		uid = from_kuid_munged(seq_user_ns(seq),
2728 				       sock_i_uid(req->rsk_listener));
2729 	} else {
2730 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2731 	}
2732 
2733 	meta.seq = seq;
2734 	prog = bpf_iter_get_info(&meta, false);
2735 	return tcp_prog_seq_show(prog, &meta, v, uid);
2736 }
2737 
2738 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2739 {
2740 	struct bpf_iter_meta meta;
2741 	struct bpf_prog *prog;
2742 
2743 	if (!v) {
2744 		meta.seq = seq;
2745 		prog = bpf_iter_get_info(&meta, true);
2746 		if (prog)
2747 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2748 	}
2749 
2750 	tcp_seq_stop(seq, v);
2751 }
2752 
2753 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2754 	.show		= bpf_iter_tcp_seq_show,
2755 	.start		= tcp_seq_start,
2756 	.next		= tcp_seq_next,
2757 	.stop		= bpf_iter_tcp_seq_stop,
2758 };
2759 #endif
2760 
2761 static const struct seq_operations tcp4_seq_ops = {
2762 	.show		= tcp4_seq_show,
2763 	.start		= tcp_seq_start,
2764 	.next		= tcp_seq_next,
2765 	.stop		= tcp_seq_stop,
2766 };
2767 
2768 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2769 	.family		= AF_INET,
2770 };
2771 
2772 static int __net_init tcp4_proc_init_net(struct net *net)
2773 {
2774 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2775 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2776 		return -ENOMEM;
2777 	return 0;
2778 }
2779 
2780 static void __net_exit tcp4_proc_exit_net(struct net *net)
2781 {
2782 	remove_proc_entry("tcp", net->proc_net);
2783 }
2784 
2785 static struct pernet_operations tcp4_net_ops = {
2786 	.init = tcp4_proc_init_net,
2787 	.exit = tcp4_proc_exit_net,
2788 };
2789 
2790 int __init tcp4_proc_init(void)
2791 {
2792 	return register_pernet_subsys(&tcp4_net_ops);
2793 }
2794 
2795 void tcp4_proc_exit(void)
2796 {
2797 	unregister_pernet_subsys(&tcp4_net_ops);
2798 }
2799 #endif /* CONFIG_PROC_FS */
2800 
2801 struct proto tcp_prot = {
2802 	.name			= "TCP",
2803 	.owner			= THIS_MODULE,
2804 	.close			= tcp_close,
2805 	.pre_connect		= tcp_v4_pre_connect,
2806 	.connect		= tcp_v4_connect,
2807 	.disconnect		= tcp_disconnect,
2808 	.accept			= inet_csk_accept,
2809 	.ioctl			= tcp_ioctl,
2810 	.init			= tcp_v4_init_sock,
2811 	.destroy		= tcp_v4_destroy_sock,
2812 	.shutdown		= tcp_shutdown,
2813 	.setsockopt		= tcp_setsockopt,
2814 	.getsockopt		= tcp_getsockopt,
2815 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
2816 	.keepalive		= tcp_set_keepalive,
2817 	.recvmsg		= tcp_recvmsg,
2818 	.sendmsg		= tcp_sendmsg,
2819 	.sendpage		= tcp_sendpage,
2820 	.backlog_rcv		= tcp_v4_do_rcv,
2821 	.release_cb		= tcp_release_cb,
2822 	.hash			= inet_hash,
2823 	.unhash			= inet_unhash,
2824 	.get_port		= inet_csk_get_port,
2825 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2826 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2827 	.stream_memory_free	= tcp_stream_memory_free,
2828 	.sockets_allocated	= &tcp_sockets_allocated,
2829 	.orphan_count		= &tcp_orphan_count,
2830 	.memory_allocated	= &tcp_memory_allocated,
2831 	.memory_pressure	= &tcp_memory_pressure,
2832 	.sysctl_mem		= sysctl_tcp_mem,
2833 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2834 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2835 	.max_header		= MAX_TCP_HEADER,
2836 	.obj_size		= sizeof(struct tcp_sock),
2837 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2838 	.twsk_prot		= &tcp_timewait_sock_ops,
2839 	.rsk_prot		= &tcp_request_sock_ops,
2840 	.h.hashinfo		= &tcp_hashinfo,
2841 	.no_autobind		= true,
2842 	.diag_destroy		= tcp_abort,
2843 };
2844 EXPORT_SYMBOL(tcp_prot);
2845 
2846 static void __net_exit tcp_sk_exit(struct net *net)
2847 {
2848 	if (net->ipv4.tcp_congestion_control)
2849 		bpf_module_put(net->ipv4.tcp_congestion_control,
2850 			       net->ipv4.tcp_congestion_control->owner);
2851 }
2852 
2853 static int __net_init tcp_sk_init(struct net *net)
2854 {
2855 	int cnt;
2856 
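	/* Per-network-namespace sysctl defaults; a new namespace starts from
	 * these values and can then tune them independently.
	 */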
2857 	net->ipv4.sysctl_tcp_ecn = 2;
2858 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2859 
2860 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2861 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2862 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2863 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2864 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2865 
2866 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2867 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2868 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2869 
2870 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2871 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2872 	net->ipv4.sysctl_tcp_syncookies = 1;
2873 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2874 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2875 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2876 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2877 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2878 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2879 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2880 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2881 
2882 	cnt = tcp_hashinfo.ehash_mask + 1;
2883 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2884 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2885 
2886 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2887 	net->ipv4.sysctl_tcp_sack = 1;
2888 	net->ipv4.sysctl_tcp_window_scaling = 1;
2889 	net->ipv4.sysctl_tcp_timestamps = 1;
2890 	net->ipv4.sysctl_tcp_early_retrans = 3;
2891 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2892 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2893 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2894 	net->ipv4.sysctl_tcp_max_reordering = 300;
2895 	net->ipv4.sysctl_tcp_dsack = 1;
2896 	net->ipv4.sysctl_tcp_app_win = 31;
2897 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2898 	net->ipv4.sysctl_tcp_frto = 2;
2899 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2900 	/* This limits the percentage of the congestion window which we
2901 	 * will allow a single TSO frame to consume.  Building TSO frames
2902 	 * which are too large can cause TCP streams to be bursty.
2903 	 */
2904 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2905 	/* Default TSQ limit of 16 TSO segments */
2906 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2907 	/* rfc5961 challenge ack rate limiting */
2908 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2909 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2910 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2911 	net->ipv4.sysctl_tcp_autocorking = 1;
2912 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2913 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2914 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2915 	if (net != &init_net) {
2916 		memcpy(net->ipv4.sysctl_tcp_rmem,
2917 		       init_net.ipv4.sysctl_tcp_rmem,
2918 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2919 		memcpy(net->ipv4.sysctl_tcp_wmem,
2920 		       init_net.ipv4.sysctl_tcp_wmem,
2921 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2922 	}
2923 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2924 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2925 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2926 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2927 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2928 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2929 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2930 
2931 	/* Reno is always built in */
2932 	if (!net_eq(net, &init_net) &&
2933 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2934 			       init_net.ipv4.tcp_congestion_control->owner))
2935 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2936 	else
2937 		net->ipv4.tcp_congestion_control = &tcp_reno;
2938 
2939 	return 0;
2940 }
2941 
2942 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2943 {
2944 	struct net *net;
2945 
2946 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2947 
2948 	list_for_each_entry(net, net_exit_list, exit_list)
2949 		tcp_fastopen_ctx_destroy(net);
2950 }
2951 
2952 static struct pernet_operations __net_initdata tcp_sk_ops = {
2953        .init	   = tcp_sk_init,
2954        .exit	   = tcp_sk_exit,
2955        .exit_batch = tcp_sk_exit_batch,
2956 };
2957 
2958 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2959 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2960 		     struct sock_common *sk_common, uid_t uid)
2961 
2962 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2963 {
2964 	struct tcp_iter_state *st = priv_data;
2965 	struct tcp_seq_afinfo *afinfo;
2966 	int ret;
2967 
2968 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2969 	if (!afinfo)
2970 		return -ENOMEM;
2971 
2972 	afinfo->family = AF_UNSPEC;
2973 	st->bpf_seq_afinfo = afinfo;
2974 	ret = bpf_iter_init_seq_net(priv_data, aux);
2975 	if (ret)
2976 		kfree(afinfo);
2977 	return ret;
2978 }
2979 
2980 static void bpf_iter_fini_tcp(void *priv_data)
2981 {
2982 	struct tcp_iter_state *st = priv_data;
2983 
2984 	kfree(st->bpf_seq_afinfo);
2985 	bpf_iter_fini_seq_net(priv_data);
2986 }
2987 
2988 static const struct bpf_iter_seq_info tcp_seq_info = {
2989 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2990 	.init_seq_private	= bpf_iter_init_tcp,
2991 	.fini_seq_private	= bpf_iter_fini_tcp,
2992 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2993 };
2994 
2995 static struct bpf_iter_reg tcp_reg_info = {
2996 	.target			= "tcp",
2997 	.ctx_arg_info_size	= 1,
2998 	.ctx_arg_info		= {
2999 		{ offsetof(struct bpf_iter__tcp, sk_common),
3000 		  PTR_TO_BTF_ID_OR_NULL },
3001 	},
3002 	.seq_info		= &tcp_seq_info,
3003 };
3004 
3005 static void __init bpf_iter_register(void)
3006 {
3007 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3008 	if (bpf_iter_reg_target(&tcp_reg_info))
3009 		pr_warn("Warning: could not register bpf iterator tcp\n");
3010 }
3011 
3012 #endif
3013 
3014 void __init tcp_v4_init(void)
3015 {
3016 	int cpu, res;
3017 
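	/* One kernel control socket per possible CPU; these are used to emit
	 * RSTs and ACKs (e.g. from SYN-RECV and TIME-WAIT handling) without
	 * needing a full user socket.
	 */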
3018 	for_each_possible_cpu(cpu) {
3019 		struct sock *sk;
3020 
3021 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3022 					   IPPROTO_TCP, &init_net);
3023 		if (res)
3024 			panic("Failed to create the TCP control socket.\n");
3025 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3026 
3027 		/* Please enforce IP_DF and IPID==0 for RST and
3028 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3029 		 */
3030 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3031 
3032 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3033 	}
3034 	if (register_pernet_subsys(&tcp_sk_ops))
3035 		panic("Failed to create the TCP control socket.\n");
3036 
3037 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3038 	bpf_iter_register();
3039 #endif
3040 }
3041