1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102 }
103
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to the 'lo' interface.
		 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132 #endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
	/* With PAWS this is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided the sequence spaces do not
	   overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually the idea is close to VJ's: only the timestamp cache is
	   held not per host but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk
157 * and releasing the bucket lock.
158 */
159 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
160 return 0;
161
162 /* In case of repair and re-using TIME-WAIT sockets we still
163 * want to be sure that it is safe as above but honor the
164 * sequence numbers and time stamps set as part of the repair
165 * process.
166 *
167 * Without this check re-using a TIME-WAIT socket with TCP
168 * repair would accumulate a -1 on the repair assigned
169 * sequence number. The first time it is reused the sequence
170 * is -1, the second time -2, etc. This fixes that issue
171 * without appearing to create any others.
172 */
173 if (likely(!tp->repair)) {
174 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
175
176 if (!seq)
177 seq = 1;
178 WRITE_ONCE(tp->write_seq, seq);
179 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
180 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
181 }
182
183 return 1;
184 }
185
186 return 0;
187 }
188 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
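
/* Illustrative note (not part of the kernel logic above): the reuse path in
 * tcp_twsk_unique() is gated by the net.ipv4.tcp_tw_reuse sysctl. A minimal
 * sketch of the three documented modes, assuming standard procps tooling:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=0   # never reuse TIME-WAIT sockets
 *	sysctl -w net.ipv4.tcp_tw_reuse=1   # reuse when timestamps make it safe
 *	sysctl -w net.ipv4.tcp_tw_reuse=2   # reuse for loopback traffic only,
 *	                                    # i.e. the "reuse == 2" branch above
 */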
189
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
192 {
	/* This check is replicated from tcp_v4_connect() and is intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside of the bound specified by the user in addr_len.
	 */
197 if (addr_len < sizeof(struct sockaddr_in))
198 return -EINVAL;
199
200 sock_owned_by_me(sk);
201
202 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
203 }
204
205 /* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
207 {
208 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
209 struct inet_sock *inet = inet_sk(sk);
210 struct tcp_sock *tp = tcp_sk(sk);
211 __be16 orig_sport, orig_dport;
212 __be32 daddr, nexthop;
213 struct flowi4 *fl4;
214 struct rtable *rt;
215 int err;
216 struct ip_options_rcu *inet_opt;
217 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
218
219 if (addr_len < sizeof(struct sockaddr_in))
220 return -EINVAL;
221
222 if (usin->sin_family != AF_INET)
223 return -EAFNOSUPPORT;
224
225 nexthop = daddr = usin->sin_addr.s_addr;
226 inet_opt = rcu_dereference_protected(inet->inet_opt,
227 lockdep_sock_is_held(sk));
228 if (inet_opt && inet_opt->opt.srr) {
229 if (!daddr)
230 return -EINVAL;
231 nexthop = inet_opt->opt.faddr;
232 }
233
234 orig_sport = inet->inet_sport;
235 orig_dport = usin->sin_port;
236 fl4 = &inet->cork.fl.u.ip4;
237 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
238 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
239 IPPROTO_TCP,
240 orig_sport, orig_dport, sk);
241 if (IS_ERR(rt)) {
242 err = PTR_ERR(rt);
243 if (err == -ENETUNREACH)
244 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
245 return err;
246 }
247
248 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
249 ip_rt_put(rt);
250 return -ENETUNREACH;
251 }
252
253 if (!inet_opt || !inet_opt->opt.srr)
254 daddr = fl4->daddr;
255
256 if (!inet->inet_saddr)
257 inet->inet_saddr = fl4->saddr;
258 sk_rcv_saddr_set(sk, inet->inet_saddr);
259
260 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
261 /* Reset inherited state */
262 tp->rx_opt.ts_recent = 0;
263 tp->rx_opt.ts_recent_stamp = 0;
264 if (likely(!tp->repair))
265 WRITE_ONCE(tp->write_seq, 0);
266 }
267
268 inet->inet_dport = usin->sin_port;
269 sk_daddr_set(sk, daddr);
270
271 inet_csk(sk)->icsk_ext_hdr_len = 0;
272 if (inet_opt)
273 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
274
275 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
276
	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
282 tcp_set_state(sk, TCP_SYN_SENT);
283 err = inet_hash_connect(tcp_death_row, sk);
284 if (err)
285 goto failure;
286
287 sk_set_txhash(sk);
288
289 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
290 inet->inet_sport, inet->inet_dport, sk);
291 if (IS_ERR(rt)) {
292 err = PTR_ERR(rt);
293 rt = NULL;
294 goto failure;
295 }
296 /* OK, now commit destination to socket. */
297 sk->sk_gso_type = SKB_GSO_TCPV4;
298 sk_setup_caps(sk, &rt->dst);
299 rt = NULL;
300
301 if (likely(!tp->repair)) {
302 if (!tp->write_seq)
303 WRITE_ONCE(tp->write_seq,
304 secure_tcp_seq(inet->inet_saddr,
305 inet->inet_daddr,
306 inet->inet_sport,
307 usin->sin_port));
308 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
309 inet->inet_saddr,
310 inet->inet_daddr);
311 }
312
313 inet->inet_id = prandom_u32();
314
315 if (tcp_fastopen_defer_connect(sk, &err))
316 return err;
317 if (err)
318 goto failure;
319
320 err = tcp_connect(sk);
321
322 if (err)
323 goto failure;
324
325 return 0;
326
327 failure:
328 /*
329 * This unhashes the socket and releases the local port,
330 * if necessary.
331 */
332 tcp_set_state(sk, TCP_CLOSE);
333 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
334 inet_reset_saddr(sk);
335 ip_rt_put(rt);
336 sk->sk_route_caps = 0;
337 inet->inet_dport = 0;
338 return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
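
/* Minimal userspace sketch (illustrative only, not kernel code): the path
 * above is what ultimately runs when an application issues a blocking
 * connect() on an AF_INET stream socket; failures such as -ENETUNREACH or
 * -EADDRNOTAVAIL surface to the caller as the corresponding errno.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");	// e.g. ENETUNREACH, ETIMEDOUT
 */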
341
/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in
 * RFC 1191. It can be called through tcp_release_cb() if the socket was
 * owned by the user at the time tcp_v4_err() was called to handle the
 * ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349 struct inet_sock *inet = inet_sk(sk);
350 struct dst_entry *dst;
351 u32 mtu;
352
353 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354 return;
355 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356 dst = inet_csk_update_pmtu(sk, mtu);
357 if (!dst)
358 return;
359
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
363 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364 sk->sk_err_soft = EMSGSIZE;
365
366 mtu = dst_mtu(dst);
367
368 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369 ip_sk_accept_pmtu(sk) &&
370 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371 tcp_sync_mss(sk, mtu);
372
373 /* Resend the TCP packet because it's
374 * clear that the old packet has been
375 * dropped. This is the new "fast" path mtu
376 * discovery.
377 */
378 tcp_simple_retransmit(sk);
379 } /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
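
/* Illustrative note: whether tcp_v4_mtu_reduced() acts on an incoming
 * ICMP_FRAG_NEEDED depends on the socket's path-MTU discovery mode, which
 * userspace can change via IP_MTU_DISCOVER. A hedged sketch, assuming a
 * standard glibc environment:
 *
 *	int val = IP_PMTUDISC_DONT;	// opt out: the pmtudisc check above bails
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 *	val = IP_PMTUDISC_DO;		// set DF and honor PMTU updates
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */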
382
static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385 struct dst_entry *dst = __sk_dst_check(sk, 0);
386
387 if (dst)
388 dst->ops->redirect(dst, sk, skb);
389 }
390
391
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395 struct request_sock *req = inet_reqsk(sk);
396 struct net *net = sock_net(sk);
397
398 /* ICMPs are not backlogged, hence we cannot get
399 * an established socket here.
400 */
401 if (seq != tcp_rsk(req)->snt_isn) {
402 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403 } else if (abort) {
404 /*
405 * Still in SYN_RECV, just remove it silently.
406 * There is no good way to pass the error to the newly
407 * created socket, and POSIX does not want network
408 * errors returned from accept().
409 */
410 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411 tcp_listendrop(req->rsk_listener);
412 }
413 reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416
417 /* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420 struct inet_connection_sock *icsk = inet_csk(sk);
421 struct tcp_sock *tp = tcp_sk(sk);
422 struct sk_buff *skb;
423 s32 remaining;
424 u32 delta_us;
425
426 if (sock_owned_by_user(sk))
427 return;
428
429 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
430 !icsk->icsk_backoff)
431 return;
432
433 skb = tcp_rtx_queue_head(sk);
434 if (WARN_ON_ONCE(!skb))
435 return;
436
437 icsk->icsk_backoff--;
438 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440
441 tcp_mstamp_refresh(tp);
442 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444
445 if (remaining > 0) {
446 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447 remaining, TCP_RTO_MAX);
448 } else {
449 /* RTO revert clocked out retransmission.
450 * Will retransmit now.
451 */
452 tcp_retransmit_timer(sk);
453 }
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
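
/* Worked example (illustrative): with a base RTO of 200 ms, three unanswered
 * retransmissions leave icsk_backoff == 3 and the backed-off timer at
 * 200 ms << 3 == 1.6 s. If an ICMP_NET_UNREACH then shows the blackout was
 * transient (RFC 6069), the code above decrements the backoff to 2, recomputes
 * the RTO as 200 ms << 2 == 800 ms, and rearms the retransmit timer with
 * whatever part of that interval has not yet elapsed since the head of the
 * retransmit queue was last sent.
 */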
456
/*
 * This routine is called by the ICMP module when it gets some sort of
 * error condition. If err < 0 then the socket should be closed and the
 * error returned to the user. If err > 0 it's just the icmp type << 8 |
 * icmp code. After adjustment, header points to the first 8 bytes of the
 * tcp header. We need to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When someone else
 * accesses the socket the ICMP is just dropped and for some paths there is
 * no check at all.
 * A more general error queue to queue errors for later handling would
 * probably be better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475 const struct iphdr *iph = (const struct iphdr *)skb->data;
476 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477 struct tcp_sock *tp;
478 struct inet_sock *inet;
479 const int type = icmp_hdr(skb)->type;
480 const int code = icmp_hdr(skb)->code;
481 struct sock *sk;
482 struct request_sock *fastopen;
483 u32 seq, snd_una;
484 int err;
485 struct net *net = dev_net(skb->dev);
486
487 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
488 th->dest, iph->saddr, ntohs(th->source),
489 inet_iif(skb), 0);
490 if (!sk) {
491 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492 return -ENOENT;
493 }
494 if (sk->sk_state == TCP_TIME_WAIT) {
495 inet_twsk_put(inet_twsk(sk));
496 return 0;
497 }
498 seq = ntohl(th->seq);
499 if (sk->sk_state == TCP_NEW_SYN_RECV) {
500 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501 type == ICMP_TIME_EXCEEDED ||
502 (type == ICMP_DEST_UNREACH &&
503 (code == ICMP_NET_UNREACH ||
504 code == ICMP_HOST_UNREACH)));
505 return 0;
506 }
507
508 bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
514 if (sock_owned_by_user(sk)) {
515 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517 }
518 if (sk->sk_state == TCP_CLOSE)
519 goto out;
520
521 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
522 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
523 goto out;
524 }
525
526 tp = tcp_sk(sk);
527 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
528 fastopen = rcu_dereference(tp->fastopen_rsk);
529 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
530 if (sk->sk_state != TCP_LISTEN &&
531 !between(seq, snd_una, tp->snd_nxt)) {
532 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
533 goto out;
534 }
535
536 switch (type) {
537 case ICMP_REDIRECT:
538 if (!sock_owned_by_user(sk))
539 do_redirect(skb, sk);
540 goto out;
541 case ICMP_SOURCE_QUENCH:
542 /* Just silently ignore these. */
543 goto out;
544 case ICMP_PARAMETERPROB:
545 err = EPROTO;
546 break;
547 case ICMP_DEST_UNREACH:
548 if (code > NR_ICMP_UNREACH)
549 goto out;
550
551 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes,
			 * so they should go through unfragmented).
			 */
556 if (sk->sk_state == TCP_LISTEN)
557 goto out;
558
559 WRITE_ONCE(tp->mtu_info, info);
560 if (!sock_owned_by_user(sk)) {
561 tcp_v4_mtu_reduced(sk);
562 } else {
563 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
564 sock_hold(sk);
565 }
566 goto out;
567 }
568
569 err = icmp_err_convert[code].errno;
570 /* check if this ICMP message allows revert of backoff.
571 * (see RFC 6069)
572 */
573 if (!fastopen &&
574 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
575 tcp_ld_RTO_revert(sk, seq);
576 break;
577 case ICMP_TIME_EXCEEDED:
578 err = EHOSTUNREACH;
579 break;
580 default:
581 goto out;
582 }
583
584 switch (sk->sk_state) {
585 case TCP_SYN_SENT:
586 case TCP_SYN_RECV:
587 /* Only in fast or simultaneous open. If a fast open socket is
588 * already accepted it is treated as a connected one below.
589 */
590 if (fastopen && !fastopen->sk)
591 break;
592
593 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
594
595 if (!sock_owned_by_user(sk)) {
596 sk->sk_err = err;
597
598 sk->sk_error_report(sk);
599
600 tcp_done(sk);
601 } else {
602 sk->sk_err_soft = err;
603 }
604 goto out;
605 }
606
	/* If we've already connected we will keep trying
	 * until we time out or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
	 * by PMTU discovery).
	 *
	 * Note that on the modern internet, where routing is unreliable and
	 * broken firewalls sit in every dark corner sending random errors as
	 * ordered by their masters, even these two messages have finally lost
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 * --ANK (980905)
	 */
622
623 inet = inet_sk(sk);
624 if (!sock_owned_by_user(sk) && inet->recverr) {
625 sk->sk_err = err;
626 sk->sk_error_report(sk);
627 } else { /* Only an error on timeout */
628 sk->sk_err_soft = err;
629 }
630
631 out:
632 bh_unlock_sock(sk);
633 sock_put(sk);
634 return 0;
635 }
636
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
638 {
639 struct tcphdr *th = tcp_hdr(skb);
640
641 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
642 skb->csum_start = skb_transport_header(skb) - skb->head;
643 skb->csum_offset = offsetof(struct tcphdr, check);
644 }
645
646 /* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
648 {
649 const struct inet_sock *inet = inet_sk(sk);
650
651 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
652 }
653 EXPORT_SYMBOL(tcp_v4_send_check);
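
/* Illustrative note: __tcp_v4_send_check() does not produce a finished
 * checksum. It seeds th->check with the folded 16-bit sum of the pseudo
 * header only, i.e. ~tcp_v4_check(len, saddr, daddr, 0), which covers
 * source address, destination address, protocol and TCP length, and records
 * via skb->csum_start / skb->csum_offset where the full one's-complement sum
 * over the TCP header and payload must later be completed and stored, either
 * by the NIC (CHECKSUM_PARTIAL offload) or by the software fallback in
 * skb_checksum_help().
 */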
654
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.
 *	So we build the reply based only on the parameters that arrived
 *	with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 #ifdef CONFIG_TCP_MD5SIG
674 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
675 #endif
676 } rep;
677 struct ip_reply_arg arg;
678 #ifdef CONFIG_TCP_MD5SIG
679 struct tcp_md5sig_key *key = NULL;
680 const __u8 *hash_location = NULL;
681 unsigned char newhash[16];
682 int genhash;
683 struct sock *sk1 = NULL;
684 #endif
685 u64 transmit_time = 0;
686 struct sock *ctl_sk;
687 struct net *net;
688
689 /* Never send a reset in response to a reset. */
690 if (th->rst)
691 return;
692
	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped
	 * our dst.
	 */
696 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
697 return;
698
699 /* Swap the send and the receive. */
700 memset(&rep, 0, sizeof(rep));
701 rep.th.dest = th->source;
702 rep.th.source = th->dest;
703 rep.th.doff = sizeof(struct tcphdr) / 4;
704 rep.th.rst = 1;
705
706 if (th->ack) {
707 rep.th.seq = th->ack_seq;
708 } else {
709 rep.th.ack = 1;
710 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
711 skb->len - (th->doff << 2));
712 }
713
714 memset(&arg, 0, sizeof(arg));
715 arg.iov[0].iov_base = (unsigned char *)&rep;
716 arg.iov[0].iov_len = sizeof(rep.th);
717
718 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
719 #ifdef CONFIG_TCP_MD5SIG
720 rcu_read_lock();
721 hash_location = tcp_parse_md5sig_option(th);
722 if (sk && sk_fullsock(sk)) {
723 const union tcp_md5_addr *addr;
724 int l3index;
725
726 /* sdif set, means packet ingressed via a device
727 * in an L3 domain and inet_iif is set to it.
728 */
729 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
730 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
731 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
732 } else if (hash_location) {
733 const union tcp_md5_addr *addr;
734 int sdif = tcp_v4_sdif(skb);
735 int dif = inet_iif(skb);
736 int l3index;
737
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key, and no RST is generated if the md5 hash doesn't
		 * match.
		 */
745 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
746 ip_hdr(skb)->saddr,
747 th->source, ip_hdr(skb)->daddr,
748 ntohs(th->source), dif, sdif);
749 /* don't send rst if it can't find key */
750 if (!sk1)
751 goto out;
752
753 /* sdif set, means packet ingressed via a device
754 * in an L3 domain and dif is set to it.
755 */
756 l3index = sdif ? dif : 0;
757 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
758 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
759 if (!key)
760 goto out;
761
762
763 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
764 if (genhash || memcmp(hash_location, newhash, 16) != 0)
765 goto out;
766
767 }
768
769 if (key) {
770 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
771 (TCPOPT_NOP << 16) |
772 (TCPOPT_MD5SIG << 8) |
773 TCPOLEN_MD5SIG);
774 /* Update length and the length the header thinks exists */
775 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
776 rep.th.doff = arg.iov[0].iov_len / 4;
777
778 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
779 key, ip_hdr(skb)->saddr,
780 ip_hdr(skb)->daddr, &rep.th);
781 }
782 #endif
783 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
784 ip_hdr(skb)->saddr, /* XXX */
785 arg.iov[0].iov_len, IPPROTO_TCP, 0);
786 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
787 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
788
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we chose to
	 * force the input interface, we would misroute in the case of an
	 * asymmetric route.
	 */
793 if (sk) {
794 arg.bound_dev_if = sk->sk_bound_dev_if;
795 if (sk_fullsock(sk))
796 trace_tcp_send_reset(sk, skb);
797 }
798
799 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
800 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
801
802 arg.tos = ip_hdr(skb)->tos;
803 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
804 local_bh_disable();
805 ctl_sk = this_cpu_read(ipv4_tcp_sk);
806 sock_net_set(ctl_sk, net);
807 if (sk) {
808 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
809 inet_twsk(sk)->tw_mark : sk->sk_mark;
810 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
811 inet_twsk(sk)->tw_priority : sk->sk_priority;
812 transmit_time = tcp_transmit_time(sk);
813 xfrm_sk_clone_policy(ctl_sk, sk);
814 } else {
815 ctl_sk->sk_mark = 0;
816 ctl_sk->sk_priority = 0;
817 }
818 ip_send_unicast_reply(ctl_sk,
819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 &arg, arg.iov[0].iov_len,
822 transmit_time);
823
824 xfrm_sk_free_policy(ctl_sk);
825 sock_net_set(ctl_sk, &init_net);
826 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
827 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
828 local_bh_enable();
829
830 #ifdef CONFIG_TCP_MD5SIG
831 out:
832 rcu_read_unlock();
833 #endif
834 }
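
/* Worked example (illustrative) of the sequence-number rules used above,
 * following RFC 793: if the offending segment carried an ACK, the RST is sent
 * with SEQ = SEG.ACK and no ACK bit; otherwise the RST carries SEQ = 0
 * (rep.th.seq stays zeroed) with the ACK bit set and ACK = SEG.SEQ + SEG.LEN,
 * where SEG.LEN counts the payload plus one for each of SYN and FIN. So a
 * bare SYN to a closed port with SEQ = 1000 produces a RST/ACK with SEQ = 0
 * and ACK = 1001.
 */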
835
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
845 {
846 const struct tcphdr *th = tcp_hdr(skb);
847 struct {
848 struct tcphdr th;
849 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
850 #ifdef CONFIG_TCP_MD5SIG
851 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
852 #endif
853 ];
854 } rep;
855 struct net *net = sock_net(sk);
856 struct ip_reply_arg arg;
857 struct sock *ctl_sk;
858 u64 transmit_time;
859
860 memset(&rep.th, 0, sizeof(struct tcphdr));
861 memset(&arg, 0, sizeof(arg));
862
863 arg.iov[0].iov_base = (unsigned char *)&rep;
864 arg.iov[0].iov_len = sizeof(rep.th);
865 if (tsecr) {
866 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
867 (TCPOPT_TIMESTAMP << 8) |
868 TCPOLEN_TIMESTAMP);
869 rep.opt[1] = htonl(tsval);
870 rep.opt[2] = htonl(tsecr);
871 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
872 }
873
874 /* Swap the send and the receive. */
875 rep.th.dest = th->source;
876 rep.th.source = th->dest;
877 rep.th.doff = arg.iov[0].iov_len / 4;
878 rep.th.seq = htonl(seq);
879 rep.th.ack_seq = htonl(ack);
880 rep.th.ack = 1;
881 rep.th.window = htons(win);
882
883 #ifdef CONFIG_TCP_MD5SIG
884 if (key) {
885 int offset = (tsecr) ? 3 : 0;
886
887 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
888 (TCPOPT_NOP << 16) |
889 (TCPOPT_MD5SIG << 8) |
890 TCPOLEN_MD5SIG);
891 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
892 rep.th.doff = arg.iov[0].iov_len/4;
893
894 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
895 key, ip_hdr(skb)->saddr,
896 ip_hdr(skb)->daddr, &rep.th);
897 }
898 #endif
899 arg.flags = reply_flags;
900 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
901 ip_hdr(skb)->saddr, /* XXX */
902 arg.iov[0].iov_len, IPPROTO_TCP, 0);
903 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
904 if (oif)
905 arg.bound_dev_if = oif;
906 arg.tos = tos;
907 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
908 local_bh_disable();
909 ctl_sk = this_cpu_read(ipv4_tcp_sk);
910 sock_net_set(ctl_sk, net);
911 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_mark : sk->sk_mark;
913 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
914 inet_twsk(sk)->tw_priority : sk->sk_priority;
915 transmit_time = tcp_transmit_time(sk);
916 ip_send_unicast_reply(ctl_sk,
917 skb, &TCP_SKB_CB(skb)->header.h4.opt,
918 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
919 &arg, arg.iov[0].iov_len,
920 transmit_time);
921
922 sock_net_set(ctl_sk, &init_net);
923 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
924 local_bh_enable();
925 }
926
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
928 {
929 struct inet_timewait_sock *tw = inet_twsk(sk);
930 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
931
932 tcp_v4_send_ack(sk, skb,
933 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
934 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
935 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
936 tcptw->tw_ts_recent,
937 tw->tw_bound_dev_if,
938 tcp_twsk_md5_key(tcptw),
939 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
940 tw->tw_tos
941 );
942
943 inet_twsk_put(tw);
944 }
945
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
948 {
949 const union tcp_md5_addr *addr;
950 int l3index;
951
952 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
953 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
954 */
955 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
956 tcp_sk(sk)->snd_nxt;
957
958 /* RFC 7323 2.3
959 * The window field (SEG.WND) of every outgoing segment, with the
960 * exception of <SYN> segments, MUST be right-shifted by
961 * Rcv.Wind.Shift bits:
962 */
963 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
964 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
965 tcp_v4_send_ack(sk, skb, seq,
966 tcp_rsk(req)->rcv_nxt,
967 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
968 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
969 READ_ONCE(req->ts_recent),
970 0,
971 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
972 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
973 ip_hdr(skb)->tos);
974 }
975
976 /*
977 * Send a SYN-ACK after having received a SYN.
978 * This still operates on a request_sock only, not on a big
979 * socket.
980 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
987 {
988 const struct inet_request_sock *ireq = inet_rsk(req);
989 struct flowi4 fl4;
990 int err = -1;
991 struct sk_buff *skb;
992 u8 tos;
993
994 /* First, grab a route. */
995 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
996 return -1;
997
998 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
999
1000 if (skb) {
1001 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1002
1003 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1004 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1005 (inet_sk(sk)->tos & INET_ECN_MASK) :
1006 inet_sk(sk)->tos;
1007
1008 if (!INET_ECN_is_capable(tos) &&
1009 tcp_bpf_ca_needs_ecn((struct sock *)req))
1010 tos |= INET_ECN_ECT_0;
1011
1012 rcu_read_lock();
1013 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1014 ireq->ir_rmt_addr,
1015 rcu_dereference(ireq->ireq_opt),
1016 tos);
1017 rcu_read_unlock();
1018 err = net_xmit_eval(err);
1019 }
1020
1021 return err;
1022 }
1023
1024 /*
1025 * IPv4 request_sock destructor.
1026 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
1028 {
1029 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1030 }
1031
1032 #ifdef CONFIG_TCP_MD5SIG
1033 /*
1034 * RFC2385 MD5 checksumming requires a mapping of
1035 * IP address->MD5 Key.
1036 * We need to maintain these in the sk structure.
1037 */
1038
1039 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1040 EXPORT_SYMBOL(tcp_md5_needed);
1041
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1043 {
1044 if (!old)
1045 return true;
1046
1047 /* l3index always overrides non-l3index */
1048 if (old->l3index && new->l3index == 0)
1049 return false;
1050 if (old->l3index == 0 && new->l3index)
1051 return true;
1052
1053 return old->prefixlen < new->prefixlen;
1054 }
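
/* Illustrative example of the ordering implemented above: suppose a socket
 * has two matching keys for peer 10.0.0.0/8, one bound to a VRF (nonzero
 * l3index) and one unbound, plus an unbound /32 key for 10.0.0.1. For traffic
 * arriving in that VRF from 10.0.0.1 the VRF-bound /8 key wins over both
 * others, because an L3-scoped key always beats an unscoped one; only among
 * keys with the same scope does the longer prefix decide.
 */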
1055
1056 /* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
1060 {
1061 const struct tcp_sock *tp = tcp_sk(sk);
1062 struct tcp_md5sig_key *key;
1063 const struct tcp_md5sig_info *md5sig;
1064 __be32 mask;
1065 struct tcp_md5sig_key *best_match = NULL;
1066 bool match;
1067
1068 /* caller either holds rcu_read_lock() or socket lock */
1069 md5sig = rcu_dereference_check(tp->md5sig_info,
1070 lockdep_sock_is_held(sk));
1071 if (!md5sig)
1072 return NULL;
1073
1074 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1075 lockdep_sock_is_held(sk)) {
1076 if (key->family != family)
1077 continue;
1078 if (key->l3index && key->l3index != l3index)
1079 continue;
1080 if (family == AF_INET) {
1081 mask = inet_make_mask(key->prefixlen);
1082 match = (key->addr.a4.s_addr & mask) ==
1083 (addr->a4.s_addr & mask);
1084 #if IS_ENABLED(CONFIG_IPV6)
1085 } else if (family == AF_INET6) {
1086 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1087 key->prefixlen);
1088 #endif
1089 } else {
1090 match = false;
1091 }
1092
1093 if (match && better_md5_match(best_match, key))
1094 best_match = key;
1095 }
1096 return best_match;
1097 }
1098 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1099
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
1104 {
1105 const struct tcp_sock *tp = tcp_sk(sk);
1106 struct tcp_md5sig_key *key;
1107 unsigned int size = sizeof(struct in_addr);
1108 const struct tcp_md5sig_info *md5sig;
1109
1110 /* caller either holds rcu_read_lock() or socket lock */
1111 md5sig = rcu_dereference_check(tp->md5sig_info,
1112 lockdep_sock_is_held(sk));
1113 if (!md5sig)
1114 return NULL;
1115 #if IS_ENABLED(CONFIG_IPV6)
1116 if (family == AF_INET6)
1117 size = sizeof(struct in6_addr);
1118 #endif
1119 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1120 lockdep_sock_is_held(sk)) {
1121 if (key->family != family)
1122 continue;
1123 if (key->l3index != l3index)
1124 continue;
1125 if (!memcmp(&key->addr, addr, size) &&
1126 key->prefixlen == prefixlen)
1127 return key;
1128 }
1129 return NULL;
1130 }
1131
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
1134 {
1135 const union tcp_md5_addr *addr;
1136 int l3index;
1137
1138 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 addr_sk->sk_bound_dev_if);
1140 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 }
1143 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1144
1145 /* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1149 {
1150 /* Add Key to the list */
1151 struct tcp_md5sig_key *key;
1152 struct tcp_sock *tp = tcp_sk(sk);
1153 struct tcp_md5sig_info *md5sig;
1154
1155 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1156 if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
1163 data_race(memcpy(key->key, newkey, newkeylen));
1164
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen
		 * value but the old key->key[]; this is the reason we use
		 * __GFP_ZERO at sock_kmalloc() time below these lines.
		 */
1170 WRITE_ONCE(key->keylen, newkeylen);
1171
1172 return 0;
1173 }
1174
1175 md5sig = rcu_dereference_protected(tp->md5sig_info,
1176 lockdep_sock_is_held(sk));
1177 if (!md5sig) {
1178 md5sig = kmalloc(sizeof(*md5sig), gfp);
1179 if (!md5sig)
1180 return -ENOMEM;
1181
1182 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 INIT_HLIST_HEAD(&md5sig->head);
1184 rcu_assign_pointer(tp->md5sig_info, md5sig);
1185 }
1186
1187 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1188 if (!key)
1189 return -ENOMEM;
1190 if (!tcp_alloc_md5sig_pool()) {
1191 sock_kfree_s(sk, key, sizeof(*key));
1192 return -ENOMEM;
1193 }
1194
1195 memcpy(key->key, newkey, newkeylen);
1196 key->keylen = newkeylen;
1197 key->family = family;
1198 key->prefixlen = prefixlen;
1199 key->l3index = l3index;
1200 memcpy(&key->addr, addr,
1201 (family == AF_INET6) ? sizeof(struct in6_addr) :
1202 sizeof(struct in_addr));
1203 hlist_add_head_rcu(&key->node, &md5sig->head);
1204 return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_add);
1207
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
1210 {
1211 struct tcp_md5sig_key *key;
1212
1213 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1214 if (!key)
1215 return -ENOENT;
1216 hlist_del_rcu(&key->node);
1217 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1218 kfree_rcu(key, rcu);
1219 return 0;
1220 }
1221 EXPORT_SYMBOL(tcp_md5_do_del);
1222
static void tcp_clear_md5_list(struct sock *sk)
1224 {
1225 struct tcp_sock *tp = tcp_sk(sk);
1226 struct tcp_md5sig_key *key;
1227 struct hlist_node *n;
1228 struct tcp_md5sig_info *md5sig;
1229
1230 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1231
1232 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1233 hlist_del_rcu(&key->node);
1234 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1235 kfree_rcu(key, rcu);
1236 }
1237 }
1238
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
1241 {
1242 struct tcp_md5sig cmd;
1243 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1244 const union tcp_md5_addr *addr;
1245 u8 prefixlen = 32;
1246 int l3index = 0;
1247
1248 if (optlen < sizeof(cmd))
1249 return -EINVAL;
1250
1251 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1252 return -EFAULT;
1253
1254 if (sin->sin_family != AF_INET)
1255 return -EINVAL;
1256
1257 if (optname == TCP_MD5SIG_EXT &&
1258 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1259 prefixlen = cmd.tcpm_prefixlen;
1260 if (prefixlen > 32)
1261 return -EINVAL;
1262 }
1263
1264 if (optname == TCP_MD5SIG_EXT &&
1265 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1266 struct net_device *dev;
1267
1268 rcu_read_lock();
1269 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1270 if (dev && netif_is_l3_master(dev))
1271 l3index = dev->ifindex;
1272
1273 rcu_read_unlock();
1274
1275 /* ok to reference set/not set outside of rcu;
1276 * right now device MUST be an L3 master
1277 */
1278 if (!dev || !l3index)
1279 return -EINVAL;
1280 }
1281
1282 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1283
1284 if (!cmd.tcpm_keylen)
1285 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1286
1287 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1288 return -EINVAL;
1289
1290 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1291 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1292 }
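
/* Minimal userspace sketch (illustrative only) of the setsockopt() call that
 * lands in tcp_v4_parse_md5_keys() above. The key is installed on the
 * listening or connecting socket before the segment exchange starts:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "198.51.100.7", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("TCP_MD5SIG");
 */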
1293
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
1297 {
1298 struct tcp4_pseudohdr *bp;
1299 struct scatterlist sg;
1300 struct tcphdr *_th;
1301
1302 bp = hp->scratch;
1303 bp->saddr = saddr;
1304 bp->daddr = daddr;
1305 bp->pad = 0;
1306 bp->protocol = IPPROTO_TCP;
1307 bp->len = cpu_to_be16(nbytes);
1308
1309 _th = (struct tcphdr *)(bp + 1);
1310 memcpy(_th, th, sizeof(*th));
1311 _th->check = 0;
1312
1313 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1314 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1315 sizeof(*bp) + sizeof(*th));
1316 return crypto_ahash_update(hp->md5_req);
1317 }
1318
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1321 {
1322 struct tcp_md5sig_pool *hp;
1323 struct ahash_request *req;
1324
1325 hp = tcp_get_md5sig_pool();
1326 if (!hp)
1327 goto clear_hash_noput;
1328 req = hp->md5_req;
1329
1330 if (crypto_ahash_init(req))
1331 goto clear_hash;
1332 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1333 goto clear_hash;
1334 if (tcp_md5_hash_key(hp, key))
1335 goto clear_hash;
1336 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1337 if (crypto_ahash_final(req))
1338 goto clear_hash;
1339
1340 tcp_put_md5sig_pool();
1341 return 0;
1342
1343 clear_hash:
1344 tcp_put_md5sig_pool();
1345 clear_hash_noput:
1346 memset(md5_hash, 0, 16);
1347 return 1;
1348 }
1349
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
1353 {
1354 struct tcp_md5sig_pool *hp;
1355 struct ahash_request *req;
1356 const struct tcphdr *th = tcp_hdr(skb);
1357 __be32 saddr, daddr;
1358
1359 if (sk) { /* valid for establish/request sockets */
1360 saddr = sk->sk_rcv_saddr;
1361 daddr = sk->sk_daddr;
1362 } else {
1363 const struct iphdr *iph = ip_hdr(skb);
1364 saddr = iph->saddr;
1365 daddr = iph->daddr;
1366 }
1367
1368 hp = tcp_get_md5sig_pool();
1369 if (!hp)
1370 goto clear_hash_noput;
1371 req = hp->md5_req;
1372
1373 if (crypto_ahash_init(req))
1374 goto clear_hash;
1375
1376 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1377 goto clear_hash;
1378 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1379 goto clear_hash;
1380 if (tcp_md5_hash_key(hp, key))
1381 goto clear_hash;
1382 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1383 if (crypto_ahash_final(req))
1384 goto clear_hash;
1385
1386 tcp_put_md5sig_pool();
1387 return 0;
1388
1389 clear_hash:
1390 tcp_put_md5sig_pool();
1391 clear_hash_noput:
1392 memset(md5_hash, 0, 16);
1393 return 1;
1394 }
1395 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1396
1397 #endif
1398
1399 /* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
1403 {
1404 #ifdef CONFIG_TCP_MD5SIG
1405 /*
1406 * This gets called for each TCP segment that arrives
1407 * so we want to be efficient.
1408 * We have 3 drop cases:
1409 * o No MD5 hash and one expected.
1410 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
1412 */
1413 const __u8 *hash_location = NULL;
1414 struct tcp_md5sig_key *hash_expected;
1415 const struct iphdr *iph = ip_hdr(skb);
1416 const struct tcphdr *th = tcp_hdr(skb);
1417 const union tcp_md5_addr *addr;
1418 unsigned char newhash[16];
1419 int genhash, l3index;
1420
1421 /* sdif set, means packet ingressed via a device
1422 * in an L3 domain and dif is set to the l3mdev
1423 */
1424 l3index = sdif ? dif : 0;
1425
1426 addr = (union tcp_md5_addr *)&iph->saddr;
1427 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1428 hash_location = tcp_parse_md5sig_option(th);
1429
1430 /* We've parsed the options - do we have a hash? */
1431 if (!hash_expected && !hash_location)
1432 return false;
1433
1434 if (hash_expected && !hash_location) {
1435 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1436 return true;
1437 }
1438
1439 if (!hash_expected && hash_location) {
1440 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1441 return true;
1442 }
1443
1444 /* Okay, so this is hash_expected and hash_location -
1445 * so we need to calculate the checksum.
1446 */
1447 genhash = tcp_v4_md5_hash_skb(newhash,
1448 hash_expected,
1449 NULL, skb);
1450
1451 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1452 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1453 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1454 &iph->saddr, ntohs(th->source),
1455 &iph->daddr, ntohs(th->dest),
1456 genhash ? " tcp_v4_calc_md5_hash failed"
1457 : "", l3index);
1458 return true;
1459 }
1460 return false;
1461 #endif
1462 return false;
1463 }
1464
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
1468 {
1469 struct inet_request_sock *ireq = inet_rsk(req);
1470 struct net *net = sock_net(sk_listener);
1471
1472 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1473 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1474 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1475 }
1476
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
1480 {
1481 return inet_csk_route_req(sk, &fl->u.ip4, req);
1482 }
1483
1484 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1485 .family = PF_INET,
1486 .obj_size = sizeof(struct tcp_request_sock),
1487 .rtx_syn_ack = tcp_rtx_synack,
1488 .send_ack = tcp_v4_reqsk_send_ack,
1489 .destructor = tcp_v4_reqsk_destructor,
1490 .send_reset = tcp_v4_send_reset,
1491 .syn_ack_timeout = tcp_syn_ack_timeout,
1492 };
1493
1494 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1495 .mss_clamp = TCP_MSS_DEFAULT,
1496 #ifdef CONFIG_TCP_MD5SIG
1497 .req_md5_lookup = tcp_v4_md5_lookup,
1498 .calc_md5_hash = tcp_v4_md5_hash_skb,
1499 #endif
1500 .init_req = tcp_v4_init_req,
1501 #ifdef CONFIG_SYN_COOKIES
1502 .cookie_init_seq = cookie_v4_init_sequence,
1503 #endif
1504 .route_req = tcp_v4_route_req,
1505 .init_seq = tcp_v4_init_seq,
1506 .init_ts_off = tcp_v4_init_ts_off,
1507 .send_synack = tcp_v4_send_synack,
1508 };
1509
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1511 {
	/* Never answer SYNs sent to broadcast or multicast addresses */
1513 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1514 goto drop;
1515
1516 return tcp_conn_request(&tcp_request_sock_ops,
1517 &tcp_request_sock_ipv4_ops, sk, skb);
1518
1519 drop:
1520 tcp_listendrop(sk);
1521 return 0;
1522 }
1523 EXPORT_SYMBOL(tcp_v4_conn_request);
1524
1525
1526 /*
1527 * The three way handshake has completed - we got a valid synack -
1528 * now create the new socket.
1529 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
1535 {
1536 struct inet_request_sock *ireq;
1537 bool found_dup_sk = false;
1538 struct inet_sock *newinet;
1539 struct tcp_sock *newtp;
1540 struct sock *newsk;
1541 #ifdef CONFIG_TCP_MD5SIG
1542 const union tcp_md5_addr *addr;
1543 struct tcp_md5sig_key *key;
1544 int l3index;
1545 #endif
1546 struct ip_options_rcu *inet_opt;
1547
1548 if (sk_acceptq_is_full(sk))
1549 goto exit_overflow;
1550
1551 newsk = tcp_create_openreq_child(sk, req, skb);
1552 if (!newsk)
1553 goto exit_nonewsk;
1554
1555 newsk->sk_gso_type = SKB_GSO_TCPV4;
1556 inet_sk_rx_dst_set(newsk, skb);
1557
1558 newtp = tcp_sk(newsk);
1559 newinet = inet_sk(newsk);
1560 ireq = inet_rsk(req);
1561 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1562 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1563 newsk->sk_bound_dev_if = ireq->ir_iif;
1564 newinet->inet_saddr = ireq->ir_loc_addr;
1565 inet_opt = rcu_dereference(ireq->ireq_opt);
1566 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1567 newinet->mc_index = inet_iif(skb);
1568 newinet->mc_ttl = ip_hdr(skb)->ttl;
1569 newinet->rcv_tos = ip_hdr(skb)->tos;
1570 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1571 if (inet_opt)
1572 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1573 newinet->inet_id = prandom_u32();
1574
1575 /* Set ToS of the new socket based upon the value of incoming SYN.
1576 * ECT bits are set later in tcp_init_transfer().
1577 */
1578 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1579 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1580
1581 if (!dst) {
1582 dst = inet_csk_route_child_sock(sk, newsk, req);
1583 if (!dst)
1584 goto put_and_exit;
1585 } else {
1586 /* syncookie case : see end of cookie_v4_check() */
1587 }
1588 sk_setup_caps(newsk, dst);
1589
1590 tcp_ca_openreq_child(newsk, dst);
1591
1592 tcp_sync_mss(newsk, dst_mtu(dst));
1593 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1594
1595 tcp_initialize_rcv_mss(newsk);
1596
1597 #ifdef CONFIG_TCP_MD5SIG
1598 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1599 /* Copy over the MD5 key from the original socket */
1600 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1601 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1602 if (key) {
1603 /*
1604 * We're using one, so create a matching key
1605 * on the newsk structure. If we fail to get
1606 * memory, then we end up not copying the key
1607 * across. Shucks.
1608 */
1609 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1610 key->key, key->keylen, GFP_ATOMIC);
1611 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1612 }
1613 #endif
1614
1615 if (__inet_inherit_port(sk, newsk) < 0)
1616 goto put_and_exit;
1617 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1618 &found_dup_sk);
1619 if (likely(*own_req)) {
1620 tcp_move_syn(newtp, req);
1621 ireq->ireq_opt = NULL;
1622 } else {
1623 newinet->inet_opt = NULL;
1624
1625 if (!req_unhash && found_dup_sk) {
			/* This code path should be executed only in the
			 * syncookie case
			 */
1629 bh_unlock_sock(newsk);
1630 sock_put(newsk);
1631 newsk = NULL;
1632 }
1633 }
1634 return newsk;
1635
1636 exit_overflow:
1637 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1638 exit_nonewsk:
1639 dst_release(dst);
1640 exit:
1641 tcp_listendrop(sk);
1642 return NULL;
1643 put_and_exit:
1644 newinet->inet_opt = NULL;
1645 inet_csk_prepare_forced_close(newsk);
1646 tcp_done(newsk);
1647 goto exit;
1648 }
1649 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1650
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1652 {
1653 #ifdef CONFIG_SYN_COOKIES
1654 const struct tcphdr *th = tcp_hdr(skb);
1655
1656 if (!th->syn)
1657 sk = cookie_v4_check(sk, skb);
1658 #endif
1659 return sk;
1660 }
1661
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
1664 {
1665 u16 mss = 0;
1666 #ifdef CONFIG_SYN_COOKIES
1667 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1668 &tcp_request_sock_ipv4_ops, sk, th);
1669 if (mss) {
1670 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1671 tcp_synq_overflow(sk);
1672 }
1673 #endif
1674 return mss;
1675 }
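
/* Illustrative note: syncookies like the one generated above are only emitted
 * when net.ipv4.tcp_syncookies permits it. A hedged sketch of the knob,
 * assuming standard procps tooling:
 *
 *	sysctl -w net.ipv4.tcp_syncookies=1   # send cookies when the SYN queue overflows
 *	sysctl -w net.ipv4.tcp_syncookies=2   # send cookies unconditionally
 *	sysctl -w net.ipv4.tcp_syncookies=0   # never send cookies
 */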
1676
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1686 {
1687 struct sock *rsk;
1688
1689 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1690 struct dst_entry *dst;
1691
1692 dst = rcu_dereference_protected(sk->sk_rx_dst,
1693 lockdep_sock_is_held(sk));
1694
1695 sock_rps_save_rxhash(sk, skb);
1696 sk_mark_napi_id(sk, skb);
1697 if (dst) {
1698 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1699 !dst->ops->check(dst, 0)) {
1700 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1701 dst_release(dst);
1702 }
1703 }
1704 tcp_rcv_established(sk, skb);
1705 return 0;
1706 }
1707
1708 if (tcp_checksum_complete(skb))
1709 goto csum_err;
1710
1711 if (sk->sk_state == TCP_LISTEN) {
1712 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1713
1714 if (!nsk)
1715 goto discard;
1716 if (nsk != sk) {
1717 if (tcp_child_process(sk, nsk, skb)) {
1718 rsk = nsk;
1719 goto reset;
1720 }
1721 return 0;
1722 }
1723 } else
1724 sock_rps_save_rxhash(sk, skb);
1725
1726 if (tcp_rcv_state_process(sk, skb)) {
1727 rsk = sk;
1728 goto reset;
1729 }
1730 return 0;
1731
1732 reset:
1733 tcp_v4_send_reset(rsk, skb);
1734 discard:
1735 kfree_skb(skb);
1736 /* Be careful here. If this function gets more complicated and
1737 * gcc suffers from register pressure on the x86, sk (in %ebx)
1738 * might be destroyed here. This current version compiles correctly,
1739 * but you have been warned.
1740 */
1741 return 0;
1742
1743 csum_err:
1744 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1745 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1746 goto discard;
1747 }
1748 EXPORT_SYMBOL(tcp_v4_do_rcv);
1749
int tcp_v4_early_demux(struct sk_buff *skb)
1751 {
1752 const struct iphdr *iph;
1753 const struct tcphdr *th;
1754 struct sock *sk;
1755
1756 if (skb->pkt_type != PACKET_HOST)
1757 return 0;
1758
1759 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1760 return 0;
1761
1762 iph = ip_hdr(skb);
1763 th = tcp_hdr(skb);
1764
1765 if (th->doff < sizeof(struct tcphdr) / 4)
1766 return 0;
1767
1768 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1769 iph->saddr, th->source,
1770 iph->daddr, ntohs(th->dest),
1771 skb->skb_iif, inet_sdif(skb));
1772 if (sk) {
1773 skb->sk = sk;
1774 skb->destructor = sock_edemux;
1775 if (sk_fullsock(sk)) {
1776 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1777
1778 if (dst)
1779 dst = dst_check(dst, 0);
1780 if (dst &&
1781 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1782 skb_dst_set_noref(skb, dst);
1783 }
1784 }
1785 return 0;
1786 }
1787
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1789 {
1790 u32 limit, tail_gso_size, tail_gso_segs;
1791 struct skb_shared_info *shinfo;
1792 const struct tcphdr *th;
1793 struct tcphdr *thtail;
1794 struct sk_buff *tail;
1795 unsigned int hdrlen;
1796 bool fragstolen;
1797 u32 gso_segs;
1798 u32 gso_size;
1799 int delta;
1800
	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without the copybreak feature).
	 */
1807 skb_condense(skb);
1808
1809 skb_dst_drop(skb);
1810
1811 if (unlikely(tcp_checksum_complete(skb))) {
1812 bh_unlock_sock(sk);
1813 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1814 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1815 return true;
1816 }
1817
1818 /* Attempt coalescing to last skb in backlog, even if we are
1819 * above the limits.
1820 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1821 */
1822 th = (const struct tcphdr *)skb->data;
1823 hdrlen = th->doff * 4;
1824
1825 tail = sk->sk_backlog.tail;
1826 if (!tail)
1827 goto no_coalesce;
1828 thtail = (struct tcphdr *)tail->data;
1829
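	/* Refuse to coalesce if the two segments are not contiguous in
	 * sequence space, differ in DSCP/ECN marking or TCP option bytes,
	 * carry SYN/RST/URG, do not both carry ACK, or cannot be merged
	 * for TLS offload / MPTCP reasons.
	 */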
1830 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1831 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1832 ((TCP_SKB_CB(tail)->tcp_flags |
1833 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1834 !((TCP_SKB_CB(tail)->tcp_flags &
1835 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1836 ((TCP_SKB_CB(tail)->tcp_flags ^
1837 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1838 #ifdef CONFIG_TLS_DEVICE
1839 tail->decrypted != skb->decrypted ||
1840 #endif
1841 !mptcp_skb_can_collapse(tail, skb) ||
1842 thtail->doff != th->doff ||
1843 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1844 goto no_coalesce;
1845
1846 __skb_pull(skb, hdrlen);
1847
1848 shinfo = skb_shinfo(skb);
1849 gso_size = shinfo->gso_size ?: skb->len;
1850 gso_segs = shinfo->gso_segs ?: 1;
1851
1852 shinfo = skb_shinfo(tail);
1853 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1854 tail_gso_segs = shinfo->gso_segs ?: 1;
1855
1856 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1857 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1858
1859 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1860 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1861 thtail->window = th->window;
1862 }
1863
1864 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1865 * thtail->fin, so that the fast path in tcp_rcv_established()
1866 * is not entered if we append a packet with a FIN.
1867 * SYN, RST, URG are not present.
1868 * ACK is set on both packets.
1869 * PSH : we do not really care in TCP stack,
1870 * at least for 'GRO' packets.
1871 */
1872 thtail->fin |= th->fin;
1873 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1874
1875 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1876 TCP_SKB_CB(tail)->has_rxtstamp = true;
1877 tail->tstamp = skb->tstamp;
1878 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1879 }
1880
1881 /* Not as strict as GRO. We only need to carry the max mss value */
1882 shinfo->gso_size = max(gso_size, tail_gso_size);
1883 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1884
1885 sk->sk_backlog.len += delta;
1886 __NET_INC_STATS(sock_net(sk),
1887 LINUX_MIB_TCPBACKLOGCOALESCE);
1888 kfree_skb_partial(skb, fragstolen);
1889 return false;
1890 }
1891 __skb_push(skb, hdrlen);
1892
1893 no_coalesce:
1894 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1895
1896 /* Only socket owner can try to collapse/prune rx queues
1897 * to reduce memory overhead, so add a little headroom here.
1898 * Only a few socket backlogs are likely to be non-empty concurrently.
1899 */
1900 limit += 64 * 1024;
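	/* Illustration with hypothetical values: for sk_rcvbuf = 4 MB and
	 * sk_sndbuf = 2 MB, the backlog may grow to roughly
	 * 4 MB + 1 MB + 64 KB of skb truesize before packets are dropped.
	 */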
1901
1902 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1903 bh_unlock_sock(sk);
1904 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1905 return true;
1906 }
1907 return false;
1908 }
1909 EXPORT_SYMBOL(tcp_add_backlog);
1910
1911 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1912 {
1913 struct tcphdr *th = (struct tcphdr *)skb->data;
1914
1915 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1916 }
1917 EXPORT_SYMBOL(tcp_filter);
1918
1919 static void tcp_v4_restore_cb(struct sk_buff *skb)
1920 {
1921 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1922 sizeof(struct inet_skb_parm));
1923 }
1924
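/* The sequence-space accounting below follows standard TCP rules: a
 * segment carrying 1000 payload bytes at seq = S covers [S, S + 1000),
 * and a SYN or FIN each consume one extra sequence number, so a bare
 * SYN has end_seq = seq + 1.
 */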
1925 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1926 const struct tcphdr *th)
1927 {
1928 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1929 * barrier() makes sure the compiler won't play fool^Waliasing games.
1930 */
1931 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1932 sizeof(struct inet_skb_parm));
1933 barrier();
1934
1935 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1936 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1937 skb->len - th->doff * 4);
1938 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1939 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1940 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1941 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1942 TCP_SKB_CB(skb)->sacked = 0;
1943 TCP_SKB_CB(skb)->has_rxtstamp =
1944 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1945 }
1946
1947 /*
1948 * From tcp_input.c
1949 */
1950
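/* Entry point for every incoming TCP/IPv4 segment: validate the header
 * length and checksum, look the socket up (including TIME_WAIT and
 * NEW_SYN_RECV pseudo-sockets), apply XFRM policy, TCP-MD5 and socket
 * filter checks, then either process the skb immediately under the
 * socket lock or queue it to the owner's backlog.
 */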
1951 int tcp_v4_rcv(struct sk_buff *skb)
1952 {
1953 struct net *net = dev_net(skb->dev);
1954 struct sk_buff *skb_to_free;
1955 int sdif = inet_sdif(skb);
1956 int dif = inet_iif(skb);
1957 const struct iphdr *iph;
1958 const struct tcphdr *th;
1959 bool refcounted;
1960 struct sock *sk;
1961 int ret;
1962
1963 if (skb->pkt_type != PACKET_HOST)
1964 goto discard_it;
1965
1966 /* Count it even if it's bad */
1967 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1968
1969 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1970 goto discard_it;
1971
1972 th = (const struct tcphdr *)skb->data;
1973
1974 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1975 goto bad_packet;
1976 if (!pskb_may_pull(skb, th->doff * 4))
1977 goto discard_it;
1978
1979 /* An explanation is required here, I think.
1980 * Packet length and doff are validated by header prediction,
1981 * provided the case of th->doff == 0 is eliminated.
1982 * So, we defer the checks. */
1983
1984 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1985 goto csum_error;
1986
1987 th = (const struct tcphdr *)skb->data;
1988 iph = ip_hdr(skb);
1989 lookup:
1990 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1991 th->dest, sdif, &refcounted);
1992 if (!sk)
1993 goto no_tcp_socket;
1994
1995 process:
1996 if (sk->sk_state == TCP_TIME_WAIT)
1997 goto do_time_wait;
1998
1999 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2000 struct request_sock *req = inet_reqsk(sk);
2001 bool req_stolen = false;
2002 struct sock *nsk;
2003
2004 sk = req->rsk_listener;
2005 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2006 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2007 sk_drops_add(sk, skb);
2008 reqsk_put(req);
2009 goto discard_it;
2010 }
2011 if (tcp_checksum_complete(skb)) {
2012 reqsk_put(req);
2013 goto csum_error;
2014 }
2015 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2016 inet_csk_reqsk_queue_drop_and_put(sk, req);
2017 goto lookup;
2018 }
2019 /* We own a reference on the listener, increase it again
2020 * as we might lose it too soon.
2021 */
2022 sock_hold(sk);
2023 refcounted = true;
2024 nsk = NULL;
2025 if (!tcp_filter(sk, skb)) {
2026 th = (const struct tcphdr *)skb->data;
2027 iph = ip_hdr(skb);
2028 tcp_v4_fill_cb(skb, iph, th);
2029 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2030 }
2031 if (!nsk) {
2032 reqsk_put(req);
2033 if (req_stolen) {
2034 /* Another cpu got exclusive access to req
2035 * and created a full blown socket.
2036 * Try to feed this packet to this socket
2037 * instead of discarding it.
2038 */
2039 tcp_v4_restore_cb(skb);
2040 sock_put(sk);
2041 goto lookup;
2042 }
2043 goto discard_and_relse;
2044 }
2045 nf_reset_ct(skb);
2046 if (nsk == sk) {
2047 reqsk_put(req);
2048 tcp_v4_restore_cb(skb);
2049 } else if (tcp_child_process(sk, nsk, skb)) {
2050 tcp_v4_send_reset(nsk, skb);
2051 goto discard_and_relse;
2052 } else {
2053 sock_put(sk);
2054 return 0;
2055 }
2056 }
2057 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2058 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2059 goto discard_and_relse;
2060 }
2061
2062 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2063 goto discard_and_relse;
2064
2065 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2066 goto discard_and_relse;
2067
2068 nf_reset_ct(skb);
2069
2070 if (tcp_filter(sk, skb))
2071 goto discard_and_relse;
2072 th = (const struct tcphdr *)skb->data;
2073 iph = ip_hdr(skb);
2074 tcp_v4_fill_cb(skb, iph, th);
2075
2076 skb->dev = NULL;
2077
2078 if (sk->sk_state == TCP_LISTEN) {
2079 ret = tcp_v4_do_rcv(sk, skb);
2080 goto put_and_return;
2081 }
2082
2083 sk_incoming_cpu_update(sk);
2084
2085 bh_lock_sock_nested(sk);
2086 tcp_segs_in(tcp_sk(sk), skb);
2087 ret = 0;
2088 if (!sock_owned_by_user(sk)) {
2089 skb_to_free = sk->sk_rx_skb_cache;
2090 sk->sk_rx_skb_cache = NULL;
2091 ret = tcp_v4_do_rcv(sk, skb);
2092 } else {
2093 if (tcp_add_backlog(sk, skb))
2094 goto discard_and_relse;
2095 skb_to_free = NULL;
2096 }
2097 bh_unlock_sock(sk);
2098 if (skb_to_free)
2099 __kfree_skb(skb_to_free);
2100
2101 put_and_return:
2102 if (refcounted)
2103 sock_put(sk);
2104
2105 return ret;
2106
2107 no_tcp_socket:
2108 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2109 goto discard_it;
2110
2111 tcp_v4_fill_cb(skb, iph, th);
2112
2113 if (tcp_checksum_complete(skb)) {
2114 csum_error:
2115 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2116 bad_packet:
2117 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2118 } else {
2119 tcp_v4_send_reset(NULL, skb);
2120 }
2121
2122 discard_it:
2123 /* Discard frame. */
2124 kfree_skb(skb);
2125 return 0;
2126
2127 discard_and_relse:
2128 sk_drops_add(sk, skb);
2129 if (refcounted)
2130 sock_put(sk);
2131 goto discard_it;
2132
2133 do_time_wait:
2134 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2135 inet_twsk_put(inet_twsk(sk));
2136 goto discard_it;
2137 }
2138
2139 tcp_v4_fill_cb(skb, iph, th);
2140
2141 if (tcp_checksum_complete(skb)) {
2142 inet_twsk_put(inet_twsk(sk));
2143 goto csum_error;
2144 }
2145 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2146 case TCP_TW_SYN: {
2147 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2148 &tcp_hashinfo, skb,
2149 __tcp_hdrlen(th),
2150 iph->saddr, th->source,
2151 iph->daddr, th->dest,
2152 inet_iif(skb),
2153 sdif);
2154 if (sk2) {
2155 inet_twsk_deschedule_put(inet_twsk(sk));
2156 sk = sk2;
2157 tcp_v4_restore_cb(skb);
2158 refcounted = false;
2159 goto process;
2160 }
2161 }
2162 /* to ACK */
2163 fallthrough;
2164 case TCP_TW_ACK:
2165 tcp_v4_timewait_ack(sk, skb);
2166 break;
2167 case TCP_TW_RST:
2168 tcp_v4_send_reset(sk, skb);
2169 inet_twsk_deschedule_put(inet_twsk(sk));
2170 goto discard_it;
2171 case TCP_TW_SUCCESS:;
2172 }
2173 goto discard_it;
2174 }
2175
2176 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2177 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2178 .twsk_unique = tcp_twsk_unique,
2179 .twsk_destructor= tcp_twsk_destructor,
2180 };
2181
2182 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2183 {
2184 struct dst_entry *dst = skb_dst(skb);
2185
2186 if (dst && dst_hold_safe(dst)) {
2187 rcu_assign_pointer(sk->sk_rx_dst, dst);
2188 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2189 }
2190 }
2191 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2192
2193 const struct inet_connection_sock_af_ops ipv4_specific = {
2194 .queue_xmit = ip_queue_xmit,
2195 .send_check = tcp_v4_send_check,
2196 .rebuild_header = inet_sk_rebuild_header,
2197 .sk_rx_dst_set = inet_sk_rx_dst_set,
2198 .conn_request = tcp_v4_conn_request,
2199 .syn_recv_sock = tcp_v4_syn_recv_sock,
2200 .net_header_len = sizeof(struct iphdr),
2201 .setsockopt = ip_setsockopt,
2202 .getsockopt = ip_getsockopt,
2203 .addr2sockaddr = inet_csk_addr2sockaddr,
2204 .sockaddr_len = sizeof(struct sockaddr_in),
2205 .mtu_reduced = tcp_v4_mtu_reduced,
2206 };
2207 EXPORT_SYMBOL(ipv4_specific);
2208
2209 #ifdef CONFIG_TCP_MD5SIG
2210 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2211 .md5_lookup = tcp_v4_md5_lookup,
2212 .calc_md5_hash = tcp_v4_md5_hash_skb,
2213 .md5_parse = tcp_v4_parse_md5_keys,
2214 };
2215 #endif
2216
2217 /* NOTE: A lot of things are set to zero explicitly by the call to
2218 * sk_alloc(), so they need not be done here.
2219 */
2220 static int tcp_v4_init_sock(struct sock *sk)
2221 {
2222 struct inet_connection_sock *icsk = inet_csk(sk);
2223
2224 tcp_init_sock(sk);
2225
2226 icsk->icsk_af_ops = &ipv4_specific;
2227
2228 #ifdef CONFIG_TCP_MD5SIG
2229 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2230 #endif
2231
2232 return 0;
2233 }
2234
2235 void tcp_v4_destroy_sock(struct sock *sk)
2236 {
2237 struct tcp_sock *tp = tcp_sk(sk);
2238
2239 trace_tcp_destroy_sock(sk);
2240
2241 tcp_clear_xmit_timers(sk);
2242
2243 tcp_cleanup_congestion_control(sk);
2244
2245 tcp_cleanup_ulp(sk);
2246
2247 /* Clean up the write buffer. */
2248 tcp_write_queue_purge(sk);
2249
2250 /* Check if we want to disable active TFO */
2251 tcp_fastopen_active_disable_ofo_check(sk);
2252
2253 /* Cleans up our, hopefully empty, out_of_order_queue. */
2254 skb_rbtree_purge(&tp->out_of_order_queue);
2255
2256 #ifdef CONFIG_TCP_MD5SIG
2257 /* Clean up the MD5 key list, if any */
2258 if (tp->md5sig_info) {
2259 tcp_clear_md5_list(sk);
2260 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2261 tp->md5sig_info = NULL;
2262 }
2263 #endif
2264
2265 /* Clean up a referenced TCP bind bucket. */
2266 if (inet_csk(sk)->icsk_bind_hash)
2267 inet_put_port(sk);
2268
2269 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2270
2271 /* If the socket was aborted during the connect operation */
2272 tcp_free_fastopen_req(tp);
2273 tcp_fastopen_destroy_cipher(sk);
2274 tcp_saved_syn_free(tp);
2275
2276 sk_sockets_allocated_dec(sk);
2277 }
2278 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2279
2280 #ifdef CONFIG_PROC_FS
2281 /* Proc filesystem TCP sock list dumping. */
2282
2283 /*
2284 * Get the next listener socket following cur. If cur is NULL, get the first
2285 * socket starting from the bucket given in st->bucket; when st->bucket is
2286 * zero the very first socket in the hash table is returned.
2287 */
2288 static void *listening_get_next(struct seq_file *seq, void *cur)
2289 {
2290 struct tcp_seq_afinfo *afinfo;
2291 struct tcp_iter_state *st = seq->private;
2292 struct net *net = seq_file_net(seq);
2293 struct inet_listen_hashbucket *ilb;
2294 struct hlist_nulls_node *node;
2295 struct sock *sk = cur;
2296
2297 if (st->bpf_seq_afinfo)
2298 afinfo = st->bpf_seq_afinfo;
2299 else
2300 afinfo = PDE_DATA(file_inode(seq->file));
2301
2302 if (!sk) {
2303 get_head:
2304 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2305 spin_lock(&ilb->lock);
2306 sk = sk_nulls_head(&ilb->nulls_head);
2307 st->offset = 0;
2308 goto get_sk;
2309 }
2310 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311 ++st->num;
2312 ++st->offset;
2313
2314 sk = sk_nulls_next(sk);
2315 get_sk:
2316 sk_nulls_for_each_from(sk, node) {
2317 if (!net_eq(sock_net(sk), net))
2318 continue;
2319 if (afinfo->family == AF_UNSPEC ||
2320 sk->sk_family == afinfo->family)
2321 return sk;
2322 }
2323 spin_unlock(&ilb->lock);
2324 st->offset = 0;
2325 if (++st->bucket < INET_LHTABLE_SIZE)
2326 goto get_head;
2327 return NULL;
2328 }
2329
2330 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2331 {
2332 struct tcp_iter_state *st = seq->private;
2333 void *rc;
2334
2335 st->bucket = 0;
2336 st->offset = 0;
2337 rc = listening_get_next(seq, NULL);
2338
2339 while (rc && *pos) {
2340 rc = listening_get_next(seq, rc);
2341 --*pos;
2342 }
2343 return rc;
2344 }
2345
2346 static inline bool empty_bucket(const struct tcp_iter_state *st)
2347 {
2348 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2349 }
2350
2351 /*
2352 * Get the first established socket, starting from the bucket given in st->bucket.
2353 * If st->bucket is zero, the very first socket in the hash table is returned.
2354 */
2355 static void *established_get_first(struct seq_file *seq)
2356 {
2357 struct tcp_seq_afinfo *afinfo;
2358 struct tcp_iter_state *st = seq->private;
2359 struct net *net = seq_file_net(seq);
2360 void *rc = NULL;
2361
2362 if (st->bpf_seq_afinfo)
2363 afinfo = st->bpf_seq_afinfo;
2364 else
2365 afinfo = PDE_DATA(file_inode(seq->file));
2366
2367 st->offset = 0;
2368 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 struct sock *sk;
2370 struct hlist_nulls_node *node;
2371 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372
2373 /* Lockless fast path for the common case of empty buckets */
2374 if (empty_bucket(st))
2375 continue;
2376
2377 spin_lock_bh(lock);
2378 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 if ((afinfo->family != AF_UNSPEC &&
2380 sk->sk_family != afinfo->family) ||
2381 !net_eq(sock_net(sk), net)) {
2382 continue;
2383 }
2384 rc = sk;
2385 goto out;
2386 }
2387 spin_unlock_bh(lock);
2388 }
2389 out:
2390 return rc;
2391 }
2392
2393 static void *established_get_next(struct seq_file *seq, void *cur)
2394 {
2395 struct tcp_seq_afinfo *afinfo;
2396 struct sock *sk = cur;
2397 struct hlist_nulls_node *node;
2398 struct tcp_iter_state *st = seq->private;
2399 struct net *net = seq_file_net(seq);
2400
2401 if (st->bpf_seq_afinfo)
2402 afinfo = st->bpf_seq_afinfo;
2403 else
2404 afinfo = PDE_DATA(file_inode(seq->file));
2405
2406 ++st->num;
2407 ++st->offset;
2408
2409 sk = sk_nulls_next(sk);
2410
2411 sk_nulls_for_each_from(sk, node) {
2412 if ((afinfo->family == AF_UNSPEC ||
2413 sk->sk_family == afinfo->family) &&
2414 net_eq(sock_net(sk), net))
2415 return sk;
2416 }
2417
2418 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2419 ++st->bucket;
2420 return established_get_first(seq);
2421 }
2422
2423 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2424 {
2425 struct tcp_iter_state *st = seq->private;
2426 void *rc;
2427
2428 st->bucket = 0;
2429 rc = established_get_first(seq);
2430
2431 while (rc && pos) {
2432 rc = established_get_next(seq, rc);
2433 --pos;
2434 }
2435 return rc;
2436 }
2437
2438 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2439 {
2440 void *rc;
2441 struct tcp_iter_state *st = seq->private;
2442
2443 st->state = TCP_SEQ_STATE_LISTENING;
2444 rc = listening_get_idx(seq, &pos);
2445
2446 if (!rc) {
2447 st->state = TCP_SEQ_STATE_ESTABLISHED;
2448 rc = established_get_idx(seq, pos);
2449 }
2450
2451 return rc;
2452 }
2453
2454 static void *tcp_seek_last_pos(struct seq_file *seq)
2455 {
2456 struct tcp_iter_state *st = seq->private;
2457 int bucket = st->bucket;
2458 int offset = st->offset;
2459 int orig_num = st->num;
2460 void *rc = NULL;
2461
2462 switch (st->state) {
2463 case TCP_SEQ_STATE_LISTENING:
2464 if (st->bucket >= INET_LHTABLE_SIZE)
2465 break;
2466 st->state = TCP_SEQ_STATE_LISTENING;
2467 rc = listening_get_next(seq, NULL);
2468 while (offset-- && rc && bucket == st->bucket)
2469 rc = listening_get_next(seq, rc);
2470 if (rc)
2471 break;
2472 st->bucket = 0;
2473 st->state = TCP_SEQ_STATE_ESTABLISHED;
2474 fallthrough;
2475 case TCP_SEQ_STATE_ESTABLISHED:
2476 if (st->bucket > tcp_hashinfo.ehash_mask)
2477 break;
2478 rc = established_get_first(seq);
2479 while (offset-- && rc && bucket == st->bucket)
2480 rc = established_get_next(seq, rc);
2481 }
2482
2483 st->num = orig_num;
2484
2485 return rc;
2486 }
2487
2488 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2489 {
2490 struct tcp_iter_state *st = seq->private;
2491 void *rc;
2492
2493 if (*pos && *pos == st->last_pos) {
2494 rc = tcp_seek_last_pos(seq);
2495 if (rc)
2496 goto out;
2497 }
2498
2499 st->state = TCP_SEQ_STATE_LISTENING;
2500 st->num = 0;
2501 st->bucket = 0;
2502 st->offset = 0;
2503 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2504
2505 out:
2506 st->last_pos = *pos;
2507 return rc;
2508 }
2509 EXPORT_SYMBOL(tcp_seq_start);
2510
2511 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2512 {
2513 struct tcp_iter_state *st = seq->private;
2514 void *rc = NULL;
2515
2516 if (v == SEQ_START_TOKEN) {
2517 rc = tcp_get_idx(seq, 0);
2518 goto out;
2519 }
2520
2521 switch (st->state) {
2522 case TCP_SEQ_STATE_LISTENING:
2523 rc = listening_get_next(seq, v);
2524 if (!rc) {
2525 st->state = TCP_SEQ_STATE_ESTABLISHED;
2526 st->bucket = 0;
2527 st->offset = 0;
2528 rc = established_get_first(seq);
2529 }
2530 break;
2531 case TCP_SEQ_STATE_ESTABLISHED:
2532 rc = established_get_next(seq, v);
2533 break;
2534 }
2535 out:
2536 ++*pos;
2537 st->last_pos = *pos;
2538 return rc;
2539 }
2540 EXPORT_SYMBOL(tcp_seq_next);
2541
2542 void tcp_seq_stop(struct seq_file *seq, void *v)
2543 {
2544 struct tcp_iter_state *st = seq->private;
2545
2546 switch (st->state) {
2547 case TCP_SEQ_STATE_LISTENING:
2548 if (v != SEQ_START_TOKEN)
2549 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2550 break;
2551 case TCP_SEQ_STATE_ESTABLISHED:
2552 if (v)
2553 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2554 break;
2555 }
2556 }
2557 EXPORT_SYMBOL(tcp_seq_stop);
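/* Locking note for the walkers above: listening_get_next() and the
 * established_get_*() helpers return with the current bucket lock held
 * and only drop it when advancing to the next bucket or running out of
 * sockets; tcp_seq_stop() releases whatever lock is still held for the
 * bucket recorded in st->bucket.
 */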
2558
2559 static void get_openreq4(const struct request_sock *req,
2560 struct seq_file *f, int i)
2561 {
2562 const struct inet_request_sock *ireq = inet_rsk(req);
2563 long delta = req->rsk_timer.expires - jiffies;
2564
2565 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2566 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2567 i,
2568 ireq->ir_loc_addr,
2569 ireq->ir_num,
2570 ireq->ir_rmt_addr,
2571 ntohs(ireq->ir_rmt_port),
2572 TCP_SYN_RECV,
2573 0, 0, /* could print option size, but that is af dependent. */
2574 1, /* timers active (only the expire timer) */
2575 jiffies_delta_to_clock_t(delta),
2576 req->num_timeout,
2577 from_kuid_munged(seq_user_ns(f),
2578 sock_i_uid(req->rsk_listener)),
2579 0, /* non standard timer */
2580 0, /* open_requests have no inode */
2581 0,
2582 req);
2583 }
2584
2585 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2586 {
2587 int timer_active;
2588 unsigned long timer_expires;
2589 const struct tcp_sock *tp = tcp_sk(sk);
2590 const struct inet_connection_sock *icsk = inet_csk(sk);
2591 const struct inet_sock *inet = inet_sk(sk);
2592 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2593 __be32 dest = inet->inet_daddr;
2594 __be32 src = inet->inet_rcv_saddr;
2595 __u16 destp = ntohs(inet->inet_dport);
2596 __u16 srcp = ntohs(inet->inet_sport);
2597 int rx_queue;
2598 int state;
2599
2600 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2601 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2602 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2603 timer_active = 1;
2604 timer_expires = icsk->icsk_timeout;
2605 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2606 timer_active = 4;
2607 timer_expires = icsk->icsk_timeout;
2608 } else if (timer_pending(&sk->sk_timer)) {
2609 timer_active = 2;
2610 timer_expires = sk->sk_timer.expires;
2611 } else {
2612 timer_active = 0;
2613 timer_expires = jiffies;
2614 }
2615
2616 state = inet_sk_state_load(sk);
2617 if (state == TCP_LISTEN)
2618 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2619 else
2620 /* Because we don't lock the socket,
2621 * we might find a transient negative value.
2622 */
2623 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2624 READ_ONCE(tp->copied_seq), 0);
2625
2626 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2627 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2628 i, src, srcp, dest, destp, state,
2629 READ_ONCE(tp->write_seq) - tp->snd_una,
2630 rx_queue,
2631 timer_active,
2632 jiffies_delta_to_clock_t(timer_expires - jiffies),
2633 icsk->icsk_retransmits,
2634 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2635 icsk->icsk_probes_out,
2636 sock_i_ino(sk),
2637 refcount_read(&sk->sk_refcnt), sk,
2638 jiffies_to_clock_t(icsk->icsk_rto),
2639 jiffies_to_clock_t(icsk->icsk_ack.ato),
2640 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2641 tp->snd_cwnd,
2642 state == TCP_LISTEN ?
2643 fastopenq->max_qlen :
2644 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2645 }
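/* Illustration (hypothetical socket, little-endian host): addresses are
 * printed as the raw __be32 value with %08X while ports are converted
 * with ntohs() first, so 127.0.0.1:8080 appears in /proc/net/tcp as
 * "0100007F:1F90".
 */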
2646
2647 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2648 struct seq_file *f, int i)
2649 {
2650 long delta = tw->tw_timer.expires - jiffies;
2651 __be32 dest, src;
2652 __u16 destp, srcp;
2653
2654 dest = tw->tw_daddr;
2655 src = tw->tw_rcv_saddr;
2656 destp = ntohs(tw->tw_dport);
2657 srcp = ntohs(tw->tw_sport);
2658
2659 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2660 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2661 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2662 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2663 refcount_read(&tw->tw_refcnt), tw);
2664 }
2665
2666 #define TMPSZ 150
2667
2668 static int tcp4_seq_show(struct seq_file *seq, void *v)
2669 {
2670 struct tcp_iter_state *st;
2671 struct sock *sk = v;
2672
2673 seq_setwidth(seq, TMPSZ - 1);
2674 if (v == SEQ_START_TOKEN) {
2675 seq_puts(seq, " sl local_address rem_address st tx_queue "
2676 "rx_queue tr tm->when retrnsmt uid timeout "
2677 "inode");
2678 goto out;
2679 }
2680 st = seq->private;
2681
2682 if (sk->sk_state == TCP_TIME_WAIT)
2683 get_timewait4_sock(v, seq, st->num);
2684 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2685 get_openreq4(v, seq, st->num);
2686 else
2687 get_tcp4_sock(v, seq, st->num);
2688 out:
2689 seq_pad(seq, '\n');
2690 return 0;
2691 }
2692
2693 #ifdef CONFIG_BPF_SYSCALL
2694 struct bpf_iter__tcp {
2695 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2696 __bpf_md_ptr(struct sock_common *, sk_common);
2697 uid_t uid __aligned(8);
2698 };
2699
2700 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2701 struct sock_common *sk_common, uid_t uid)
2702 {
2703 struct bpf_iter__tcp ctx;
2704
2705 meta->seq_num--; /* skip SEQ_START_TOKEN */
2706 ctx.meta = meta;
2707 ctx.sk_common = sk_common;
2708 ctx.uid = uid;
2709 return bpf_iter_run_prog(prog, &ctx);
2710 }
2711
2712 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2713 {
2714 struct bpf_iter_meta meta;
2715 struct bpf_prog *prog;
2716 struct sock *sk = v;
2717 uid_t uid;
2718
2719 if (v == SEQ_START_TOKEN)
2720 return 0;
2721
2722 if (sk->sk_state == TCP_TIME_WAIT) {
2723 uid = 0;
2724 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2725 const struct request_sock *req = v;
2726
2727 uid = from_kuid_munged(seq_user_ns(seq),
2728 sock_i_uid(req->rsk_listener));
2729 } else {
2730 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2731 }
2732
2733 meta.seq = seq;
2734 prog = bpf_iter_get_info(&meta, false);
2735 return tcp_prog_seq_show(prog, &meta, v, uid);
2736 }
2737
2738 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2739 {
2740 struct bpf_iter_meta meta;
2741 struct bpf_prog *prog;
2742
2743 if (!v) {
2744 meta.seq = seq;
2745 prog = bpf_iter_get_info(&meta, true);
2746 if (prog)
2747 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2748 }
2749
2750 tcp_seq_stop(seq, v);
2751 }
2752
2753 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2754 .show = bpf_iter_tcp_seq_show,
2755 .start = tcp_seq_start,
2756 .next = tcp_seq_next,
2757 .stop = bpf_iter_tcp_seq_stop,
2758 };
2759 #endif
2760
2761 static const struct seq_operations tcp4_seq_ops = {
2762 .show = tcp4_seq_show,
2763 .start = tcp_seq_start,
2764 .next = tcp_seq_next,
2765 .stop = tcp_seq_stop,
2766 };
2767
2768 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2769 .family = AF_INET,
2770 };
2771
2772 static int __net_init tcp4_proc_init_net(struct net *net)
2773 {
2774 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2775 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2776 return -ENOMEM;
2777 return 0;
2778 }
2779
2780 static void __net_exit tcp4_proc_exit_net(struct net *net)
2781 {
2782 remove_proc_entry("tcp", net->proc_net);
2783 }
2784
2785 static struct pernet_operations tcp4_net_ops = {
2786 .init = tcp4_proc_init_net,
2787 .exit = tcp4_proc_exit_net,
2788 };
2789
2790 int __init tcp4_proc_init(void)
2791 {
2792 return register_pernet_subsys(&tcp4_net_ops);
2793 }
2794
2795 void tcp4_proc_exit(void)
2796 {
2797 unregister_pernet_subsys(&tcp4_net_ops);
2798 }
2799 #endif /* CONFIG_PROC_FS */
2800
2801 struct proto tcp_prot = {
2802 .name = "TCP",
2803 .owner = THIS_MODULE,
2804 .close = tcp_close,
2805 .pre_connect = tcp_v4_pre_connect,
2806 .connect = tcp_v4_connect,
2807 .disconnect = tcp_disconnect,
2808 .accept = inet_csk_accept,
2809 .ioctl = tcp_ioctl,
2810 .init = tcp_v4_init_sock,
2811 .destroy = tcp_v4_destroy_sock,
2812 .shutdown = tcp_shutdown,
2813 .setsockopt = tcp_setsockopt,
2814 .getsockopt = tcp_getsockopt,
2815 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
2816 .keepalive = tcp_set_keepalive,
2817 .recvmsg = tcp_recvmsg,
2818 .sendmsg = tcp_sendmsg,
2819 .sendpage = tcp_sendpage,
2820 .backlog_rcv = tcp_v4_do_rcv,
2821 .release_cb = tcp_release_cb,
2822 .hash = inet_hash,
2823 .unhash = inet_unhash,
2824 .get_port = inet_csk_get_port,
2825 .enter_memory_pressure = tcp_enter_memory_pressure,
2826 .leave_memory_pressure = tcp_leave_memory_pressure,
2827 .stream_memory_free = tcp_stream_memory_free,
2828 .sockets_allocated = &tcp_sockets_allocated,
2829 .orphan_count = &tcp_orphan_count,
2830 .memory_allocated = &tcp_memory_allocated,
2831 .memory_pressure = &tcp_memory_pressure,
2832 .sysctl_mem = sysctl_tcp_mem,
2833 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2834 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2835 .max_header = MAX_TCP_HEADER,
2836 .obj_size = sizeof(struct tcp_sock),
2837 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2838 .twsk_prot = &tcp_timewait_sock_ops,
2839 .rsk_prot = &tcp_request_sock_ops,
2840 .h.hashinfo = &tcp_hashinfo,
2841 .no_autobind = true,
2842 .diag_destroy = tcp_abort,
2843 };
2844 EXPORT_SYMBOL(tcp_prot);
2845
2846 static void __net_exit tcp_sk_exit(struct net *net)
2847 {
2848 if (net->ipv4.tcp_congestion_control)
2849 bpf_module_put(net->ipv4.tcp_congestion_control,
2850 net->ipv4.tcp_congestion_control->owner);
2851 }
2852
2853 static int __net_init tcp_sk_init(struct net *net)
2854 {
2855 int cnt;
2856
2857 net->ipv4.sysctl_tcp_ecn = 2;
2858 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2859
2860 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2861 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2862 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2863 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2864 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2865
2866 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2867 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2868 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2869
2870 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2871 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2872 net->ipv4.sysctl_tcp_syncookies = 1;
2873 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2874 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2875 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2876 net->ipv4.sysctl_tcp_orphan_retries = 0;
2877 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2878 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2879 net->ipv4.sysctl_tcp_tw_reuse = 2;
2880 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2881
2882 cnt = tcp_hashinfo.ehash_mask + 1;
2883 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2884 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2885
2886 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2887 net->ipv4.sysctl_tcp_sack = 1;
2888 net->ipv4.sysctl_tcp_window_scaling = 1;
2889 net->ipv4.sysctl_tcp_timestamps = 1;
2890 net->ipv4.sysctl_tcp_early_retrans = 3;
2891 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2892 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2893 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2894 net->ipv4.sysctl_tcp_max_reordering = 300;
2895 net->ipv4.sysctl_tcp_dsack = 1;
2896 net->ipv4.sysctl_tcp_app_win = 31;
2897 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2898 net->ipv4.sysctl_tcp_frto = 2;
2899 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2900 /* This limits the percentage of the congestion window which we
2901 * will allow a single TSO frame to consume. Building TSO frames
2902 * which are too large can cause TCP streams to be bursty.
2903 */
2904 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2905 /* Default TSQ limit of 16 TSO segments */
2906 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2907 /* RFC 5961 challenge ACK rate limiting */
2908 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2909 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2910 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2911 net->ipv4.sysctl_tcp_autocorking = 1;
2912 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2913 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2914 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2915 if (net != &init_net) {
2916 memcpy(net->ipv4.sysctl_tcp_rmem,
2917 init_net.ipv4.sysctl_tcp_rmem,
2918 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2919 memcpy(net->ipv4.sysctl_tcp_wmem,
2920 init_net.ipv4.sysctl_tcp_wmem,
2921 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2922 }
2923 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2924 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2925 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2926 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2927 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2928 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2929 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2930
2931 /* Reno is always built in */
2932 if (!net_eq(net, &init_net) &&
2933 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2934 init_net.ipv4.tcp_congestion_control->owner))
2935 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2936 else
2937 net->ipv4.tcp_congestion_control = &tcp_reno;
2938
2939 return 0;
2940 }
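/* The defaults above are per network namespace and are exposed as the
 * sysctls under /proc/sys/net/ipv4/ (tcp_syncookies, tcp_fin_timeout,
 * tcp_rmem, tcp_wmem, ...); a child namespace starts from these values
 * and copies tcp_rmem/tcp_wmem from init_net rather than sharing them.
 */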
2941
2942 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2943 {
2944 struct net *net;
2945
2946 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2947
2948 list_for_each_entry(net, net_exit_list, exit_list)
2949 tcp_fastopen_ctx_destroy(net);
2950 }
2951
2952 static struct pernet_operations __net_initdata tcp_sk_ops = {
2953 .init = tcp_sk_init,
2954 .exit = tcp_sk_exit,
2955 .exit_batch = tcp_sk_exit_batch,
2956 };
2957
2958 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2959 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2960 struct sock_common *sk_common, uid_t uid)
2961
2962 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2963 {
2964 struct tcp_iter_state *st = priv_data;
2965 struct tcp_seq_afinfo *afinfo;
2966 int ret;
2967
2968 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2969 if (!afinfo)
2970 return -ENOMEM;
2971
2972 afinfo->family = AF_UNSPEC;
2973 st->bpf_seq_afinfo = afinfo;
2974 ret = bpf_iter_init_seq_net(priv_data, aux);
2975 if (ret)
2976 kfree(afinfo);
2977 return ret;
2978 }
2979
2980 static void bpf_iter_fini_tcp(void *priv_data)
2981 {
2982 struct tcp_iter_state *st = priv_data;
2983
2984 kfree(st->bpf_seq_afinfo);
2985 bpf_iter_fini_seq_net(priv_data);
2986 }
2987
2988 static const struct bpf_iter_seq_info tcp_seq_info = {
2989 .seq_ops = &bpf_iter_tcp_seq_ops,
2990 .init_seq_private = bpf_iter_init_tcp,
2991 .fini_seq_private = bpf_iter_fini_tcp,
2992 .seq_priv_size = sizeof(struct tcp_iter_state),
2993 };
2994
2995 static struct bpf_iter_reg tcp_reg_info = {
2996 .target = "tcp",
2997 .ctx_arg_info_size = 1,
2998 .ctx_arg_info = {
2999 { offsetof(struct bpf_iter__tcp, sk_common),
3000 PTR_TO_BTF_ID_OR_NULL },
3001 },
3002 .seq_info = &tcp_seq_info,
3003 };
3004
3005 static void __init bpf_iter_register(void)
3006 {
3007 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3008 if (bpf_iter_reg_target(&tcp_reg_info))
3009 pr_warn("Warning: could not register bpf iterator tcp\n");
3010 }
3011
3012 #endif
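/* For reference, a minimal BPF program attaching to the "tcp" iterator
 * registered above could look like the sketch below (BPF C compiled in
 * userspace, assuming libbpf's SEC() and BPF_SEQ_PRINTF() helpers; the
 * program name and field choices are illustrative only):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */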
3013
3014 void __init tcp_v4_init(void)
3015 {
3016 int cpu, res;
3017
3018 for_each_possible_cpu(cpu) {
3019 struct sock *sk;
3020
3021 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3022 IPPROTO_TCP, &init_net);
3023 if (res)
3024 panic("Failed to create the TCP control socket.\n");
3025 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3026
3027 /* Please enforce IP_DF and IPID==0 for RST and
3028 * ACK sent in SYN-RECV and TIME-WAIT state.
3029 */
3030 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3031
3032 per_cpu(ipv4_tcp_sk, cpu) = sk;
3033 }
3034 if (register_pernet_subsys(&tcp_sk_ops))
3035 panic("Failed to create the TCP control socket.\n");
3036
3037 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3038 bpf_iter_register();
3039 #endif
3040 }
3041