xref: /kernel/linux/linux-5.10/net/mptcp/options.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/* Multipath TCP
3 *
4 * Copyright (c) 2017 - 2019, Intel Corporation.
5 */
6
7#define pr_fmt(fmt) "MPTCP: " fmt
8
9#include <linux/kernel.h>
10#include <crypto/sha.h>
11#include <net/tcp.h>
12#include <net/mptcp.h>
13#include "protocol.h"
14#include "mib.h"
15
16static bool mptcp_cap_flag_sha256(u8 flags)
17{
18	return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
19}
20
/* Parse a single MPTCP TCP option and record its content in @mp_opt.
 *
 * @skb:    packet carrying the option; its TCP flags and length are used to
 *          select the expected MP_CAPABLE option size
 * @ptr:    first byte following the option kind and length bytes; the upper
 *          nibble of that byte is the MPTCP subtype
 * @opsize: option length, as read by the caller from the length byte
 * @mp_opt: parse result, updated in place
 *
 * An option failing the per-subtype size/content checks is ignored: the
 * matching presence flag (mp_capable, mp_join, dss, add_addr, rm_addr) is
 * left clear. Note that some other fields of @mp_opt may still have been
 * written before the check failed (see the DSS case).
 */
static void mptcp_parse_option(const struct sk_buff *skb,
			       const unsigned char *ptr, int opsize,
			       struct mptcp_options_received *mp_opt)
{
	u8 subtype = *ptr >> 4;
	int expected_opsize;
	u8 version;
	u8 flags;

	switch (subtype) {
	case MPTCPOPT_MP_CAPABLE:
		/* strict size checking: the expected length depends on which
		 * handshake packet this is (SYN, SYN/ACK, ACK, ACK with
		 * payload)
		 */
		if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			if (skb->len > tcp_hdr(skb)->doff << 2)
				expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
			else
				expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
		} else {
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
				expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
			else
				expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
		}
		if (opsize != expected_opsize)
			break;

		/* try to be gentle vs future versions on the initial syn:
		 * a plain SYN may carry a newer version, every other packet
		 * must match the supported version exactly
		 */
		version = *ptr++ & MPTCP_VERSION_MASK;
		if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
			if (version != MPTCP_SUPPORTED_VERSION)
				break;
		} else if (version < MPTCP_SUPPORTED_VERSION) {
			break;
		}

		flags = *ptr++;
		if (!mptcp_cap_flag_sha256(flags) ||
		    (flags & MPTCP_CAP_EXTENSIBILITY))
			break;

		/* RFC 6824, Section 3.1:
		 * "For the Checksum Required bit (labeled "A"), if either
		 * host requires the use of checksums, checksums MUST be used.
		 * In other words, the only way for checksums not to be used
		 * is if both hosts in their SYNs set A=0."
		 *
		 * Section 3.3.0:
		 * "If a checksum is not present when its use has been
		 * negotiated, the receiver MUST close the subflow with a RST as
		 * it is considered broken."
		 *
		 * We don't implement DSS checksum - fall back to TCP.
		 */
		if (flags & MPTCP_CAP_CHECKSUM_REQD)
			break;

		mp_opt->mp_capable = 1;
		/* the keys are present only in the longer option variants */
		if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
			mp_opt->sndr_key = get_unaligned_be64(ptr);
			ptr += 8;
		}
		if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
			mp_opt->rcvr_key = get_unaligned_be64(ptr);
			ptr += 8;
		}
		if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
			/* Section 3.1.:
			 * "the data parameters in a MP_CAPABLE are semantically
			 * equivalent to those in a DSS option and can be used
			 * interchangeably."
			 */
			mp_opt->dss = 1;
			mp_opt->use_map = 1;
			mp_opt->mpc_map = 1;
			mp_opt->use_ack = 0;
			mp_opt->data_len = get_unaligned_be16(ptr);
			ptr += 2;
		}
		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
			 version, flags, opsize, mp_opt->sndr_key,
			 mp_opt->rcvr_key, mp_opt->data_len);
		break;

	case MPTCPOPT_MP_JOIN:
		/* set optimistically, cleared again below if the size does
		 * not match any of the three MP_JOIN variants
		 */
		mp_opt->mp_join = 1;
		if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
			mp_opt->join_id = *ptr++;
			mp_opt->token = get_unaligned_be32(ptr);
			ptr += 4;
			mp_opt->nonce = get_unaligned_be32(ptr);
			ptr += 4;
			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
				 mp_opt->backup, mp_opt->join_id,
				 mp_opt->token, mp_opt->nonce);
		} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
			mp_opt->join_id = *ptr++;
			mp_opt->thmac = get_unaligned_be64(ptr);
			ptr += 8;
			mp_opt->nonce = get_unaligned_be32(ptr);
			ptr += 4;
			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
				 mp_opt->backup, mp_opt->join_id,
				 mp_opt->thmac, mp_opt->nonce);
		} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
			/* skip the two bytes preceding the HMAC */
			ptr += 2;
			memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
			pr_debug("MP_JOIN hmac");
		} else {
			mp_opt->mp_join = 0;
		}
		break;

	case MPTCPOPT_DSS:
		pr_debug("DSS");
		/* skip the subtype byte, the flags live in the next one */
		ptr++;

		/* we must clear 'mpc_map' to be able to detect MP_CAPABLE
		 * map vs DSS map in mptcp_incoming_options(), and reconstruct
		 * map info accordingly
		 */
		mp_opt->mpc_map = 0;
		/* NOTE: the flag-derived fields below are written before the
		 * option size is validated; only 'dss' is withheld when the
		 * size check fails
		 */
		flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
		mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
		mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
		mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
		mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
		mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);

		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
			 mp_opt->data_fin, mp_opt->dsn64,
			 mp_opt->use_map, mp_opt->ack64,
			 mp_opt->use_ack);

		/* compute the option length implied by the flags */
		expected_opsize = TCPOLEN_MPTCP_DSS_BASE;

		if (mp_opt->use_ack) {
			if (mp_opt->ack64)
				expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
			else
				expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
		}

		if (mp_opt->use_map) {
			if (mp_opt->dsn64)
				expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
			else
				expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
		}

		/* RFC 6824, Section 3.3:
		 * If a checksum is present, but its use had
		 * not been negotiated in the MP_CAPABLE handshake,
		 * the checksum field MUST be ignored.
		 */
		if (opsize != expected_opsize &&
		    opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
			break;

		mp_opt->dss = 1;

		if (mp_opt->use_ack) {
			if (mp_opt->ack64) {
				mp_opt->data_ack = get_unaligned_be64(ptr);
				ptr += 8;
			} else {
				mp_opt->data_ack = get_unaligned_be32(ptr);
				ptr += 4;
			}

			pr_debug("data_ack=%llu", mp_opt->data_ack);
		}

		if (mp_opt->use_map) {
			if (mp_opt->dsn64) {
				mp_opt->data_seq = get_unaligned_be64(ptr);
				ptr += 8;
			} else {
				mp_opt->data_seq = get_unaligned_be32(ptr);
				ptr += 4;
			}

			mp_opt->subflow_seq = get_unaligned_be32(ptr);
			ptr += 4;

			mp_opt->data_len = get_unaligned_be16(ptr);
			ptr += 2;

			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
				 mp_opt->data_seq, mp_opt->subflow_seq,
				 mp_opt->data_len);
		}

		break;

	case MPTCPOPT_ADD_ADDR:
		/* the address family is inferred from the option length; the
		 * set of valid lengths differs between echo and non-echo
		 */
		mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
		if (!mp_opt->echo) {
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
			else
				break;
		} else {
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
			else
				break;
		}

		mp_opt->add_addr = 1;
		mp_opt->addr_id = *ptr++;
		pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo);
		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
			memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
			ptr += 4;
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
				mp_opt->port = get_unaligned_be16(ptr);
				ptr += 2;
			}
		}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else {
			memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
			ptr += 16;
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
				mp_opt->port = get_unaligned_be16(ptr);
				ptr += 2;
			}
		}
#endif
		/* the trailing HMAC is read only for non-echo options */
		if (!mp_opt->echo) {
			mp_opt->ahmac = get_unaligned_be64(ptr);
			ptr += 8;
		}
		break;

	case MPTCPOPT_RM_ADDR:
		if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
			break;

		/* skip the subtype byte, the address id follows */
		ptr++;

		mp_opt->rm_addr = 1;
		mp_opt->rm_id = *ptr++;
		pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
		break;

	default:
		/* other subtypes are ignored */
		break;
	}
}
287
288void mptcp_get_options(const struct sk_buff *skb,
289		       struct mptcp_options_received *mp_opt)
290{
291	const struct tcphdr *th = tcp_hdr(skb);
292	const unsigned char *ptr;
293	int length;
294
295	/* initialize option status */
296	mp_opt->mp_capable = 0;
297	mp_opt->mp_join = 0;
298	mp_opt->add_addr = 0;
299	mp_opt->ahmac = 0;
300	mp_opt->port = 0;
301	mp_opt->rm_addr = 0;
302	mp_opt->dss = 0;
303
304	length = (th->doff * 4) - sizeof(struct tcphdr);
305	ptr = (const unsigned char *)(th + 1);
306
307	while (length > 0) {
308		int opcode = *ptr++;
309		int opsize;
310
311		switch (opcode) {
312		case TCPOPT_EOL:
313			return;
314		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
315			length--;
316			continue;
317		default:
318			if (length < 2)
319				return;
320			opsize = *ptr++;
321			if (opsize < 2) /* "silly options" */
322				return;
323			if (opsize > length)
324				return;	/* don't parse partial options */
325			if (opcode == TCPOPT_MPTCP)
326				mptcp_parse_option(skb, ptr, opsize, mp_opt);
327			ptr += opsize - 2;
328			length -= opsize;
329		}
330	}
331}
332
333bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
334		       unsigned int *size, struct mptcp_out_options *opts)
335{
336	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
337
338	/* we will use snd_isn to detect first pkt [re]transmission
339	 * in mptcp_established_options_mp()
340	 */
341	subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
342	if (subflow->request_mptcp) {
343		opts->suboptions = OPTION_MPTCP_MPC_SYN;
344		*size = TCPOLEN_MPTCP_MPC_SYN;
345		return true;
346	} else if (subflow->request_join) {
347		pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
348			 subflow->local_nonce);
349		opts->suboptions = OPTION_MPTCP_MPJ_SYN;
350		opts->join_id = subflow->local_id;
351		opts->token = subflow->remote_token;
352		opts->nonce = subflow->local_nonce;
353		opts->backup = subflow->request_bkup;
354		*size = TCPOLEN_MPTCP_MPJ_SYN;
355		return true;
356	}
357	return false;
358}
359
360/* MP_JOIN client subflow must wait for 4th ack before sending any data:
361 * TCP can't schedule delack timer before the subflow is fully established.
362 * MPTCP uses the delack timer to do 3rd ack retransmissions
363 */
364static void schedule_3rdack_retransmission(struct sock *sk)
365{
366	struct inet_connection_sock *icsk = inet_csk(sk);
367	struct tcp_sock *tp = tcp_sk(sk);
368	unsigned long timeout;
369
370	/* reschedule with a timeout above RTT, as we must look only for drop */
371	if (tp->srtt_us)
372		timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1));
373	else
374		timeout = TCP_TIMEOUT_INIT;
375	timeout += jiffies;
376
377	WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
378	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
379	icsk->icsk_ack.timeout = timeout;
380	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
381}
382
383static void clear_3rdack_retransmission(struct sock *sk)
384{
385	struct inet_connection_sock *icsk = inet_csk(sk);
386
387	sk_stop_timer(sk, &icsk->icsk_delack_timer);
388	icsk->icsk_ack.timeout = 0;
389	icsk->icsk_ack.ato = 0;
390	icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
391}
392
393static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
394					 unsigned int *size,
395					 unsigned int remaining,
396					 struct mptcp_out_options *opts)
397{
398	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
399	struct mptcp_ext *mpext;
400	unsigned int data_len;
401
402	/* When skb is not available, we better over-estimate the emitted
403	 * options len. A full DSS option (28 bytes) is longer than
404	 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
405	 * tell the caller to defer the estimate to
406	 * mptcp_established_options_dss(), which will reserve enough space.
407	 */
408	if (!skb)
409		return false;
410
411	/* MPC/MPJ needed only on 3rd ack packet */
412	if (subflow->fully_established ||
413	    subflow->snd_isn != TCP_SKB_CB(skb)->seq)
414		return false;
415
416	if (subflow->mp_capable) {
417		mpext = mptcp_get_ext(skb);
418		data_len = mpext ? mpext->data_len : 0;
419
420		/* we will check ext_copy.data_len in mptcp_write_options() to
421		 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
422		 * TCPOLEN_MPTCP_MPC_ACK
423		 */
424		opts->ext_copy.data_len = data_len;
425		opts->suboptions = OPTION_MPTCP_MPC_ACK;
426		opts->sndr_key = subflow->local_key;
427		opts->rcvr_key = subflow->remote_key;
428
429		/* Section 3.1.
430		 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
431		 * packets that start the first subflow of an MPTCP connection,
432		 * as well as the first packet that carries data
433		 */
434		if (data_len > 0)
435			*size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
436		else
437			*size = TCPOLEN_MPTCP_MPC_ACK;
438
439		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
440			 subflow, subflow->local_key, subflow->remote_key,
441			 data_len);
442
443		return true;
444	} else if (subflow->mp_join) {
445		opts->suboptions = OPTION_MPTCP_MPJ_ACK;
446		memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
447		*size = TCPOLEN_MPTCP_MPJ_ACK;
448		pr_debug("subflow=%p", subflow);
449
450		schedule_3rdack_retransmission(sk);
451		return true;
452	}
453	return false;
454}
455
456static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
457				 struct sk_buff *skb, struct mptcp_ext *ext)
458{
459	/* The write_seq value has already been incremented, so the actual
460	 * sequence number for the DATA_FIN is one less.
461	 */
462	u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;
463
464	if (!ext->use_map || !skb->len) {
465		/* RFC6824 requires a DSS mapping with specific values
466		 * if DATA_FIN is set but no data payload is mapped
467		 */
468		ext->data_fin = 1;
469		ext->use_map = 1;
470		ext->dsn64 = 1;
471		ext->data_seq = data_fin_tx_seq;
472		ext->subflow_seq = 0;
473		ext->data_len = 1;
474	} else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
475		/* If there's an existing DSS mapping and it is the
476		 * final mapping, DATA_FIN consumes 1 additional byte of
477		 * mapping space.
478		 */
479		ext->data_fin = 1;
480		ext->data_len++;
481	}
482}
483
484static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
485					  unsigned int *size,
486					  unsigned int remaining,
487					  struct mptcp_out_options *opts)
488{
489	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
490	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
491	unsigned int dss_size = 0;
492	u64 snd_data_fin_enable;
493	struct mptcp_ext *mpext;
494	unsigned int ack_size;
495	bool ret = false;
496
497	mpext = skb ? mptcp_get_ext(skb) : NULL;
498	snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
499
500	if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
501		unsigned int map_size;
502
503		map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
504
505		remaining -= map_size;
506		dss_size = map_size;
507		if (mpext)
508			opts->ext_copy = *mpext;
509
510		if (skb && snd_data_fin_enable)
511			mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
512		ret = true;
513	}
514
515	/* passive sockets msk will set the 'can_ack' after accept(), even
516	 * if the first subflow may have the already the remote key handy
517	 */
518	opts->ext_copy.use_ack = 0;
519	if (!READ_ONCE(msk->can_ack)) {
520		*size = ALIGN(dss_size, 4);
521		return ret;
522	}
523
524	if (READ_ONCE(msk->use_64bit_ack)) {
525		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
526		opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
527		opts->ext_copy.ack64 = 1;
528	} else {
529		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
530		opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
531		opts->ext_copy.ack64 = 0;
532	}
533	opts->ext_copy.use_ack = 1;
534
535	/* Add kind/length/subtype/flag overhead if mapping is not populated */
536	if (dss_size == 0)
537		ack_size += TCPOLEN_MPTCP_DSS_BASE;
538
539	dss_size += ack_size;
540
541	*size = ALIGN(dss_size, 4);
542	return true;
543}
544
545static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
546				  struct in_addr *addr)
547{
548	u8 hmac[SHA256_DIGEST_SIZE];
549	u8 msg[7];
550
551	msg[0] = addr_id;
552	memcpy(&msg[1], &addr->s_addr, 4);
553	msg[5] = 0;
554	msg[6] = 0;
555
556	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
557
558	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
559}
560
561#if IS_ENABLED(CONFIG_MPTCP_IPV6)
562static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
563				   struct in6_addr *addr)
564{
565	u8 hmac[SHA256_DIGEST_SIZE];
566	u8 msg[19];
567
568	msg[0] = addr_id;
569	memcpy(&msg[1], &addr->s6_addr, 16);
570	msg[17] = 0;
571	msg[18] = 0;
572
573	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
574
575	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
576}
577#endif
578
579static bool mptcp_established_options_add_addr(struct sock *sk,
580					       unsigned int *size,
581					       unsigned int remaining,
582					       struct mptcp_out_options *opts)
583{
584	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
585	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
586	struct mptcp_addr_info saddr;
587	bool echo;
588	int len;
589
590	if (!mptcp_pm_should_add_signal(msk) ||
591	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo)))
592		return false;
593
594	len = mptcp_add_addr_len(saddr.family, echo);
595	if (remaining < len)
596		return false;
597
598	*size = len;
599	opts->addr_id = saddr.id;
600	if (saddr.family == AF_INET) {
601		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
602		opts->addr = saddr.addr;
603		if (!echo) {
604			opts->ahmac = add_addr_generate_hmac(msk->local_key,
605							     msk->remote_key,
606							     opts->addr_id,
607							     &opts->addr);
608		}
609	}
610#if IS_ENABLED(CONFIG_MPTCP_IPV6)
611	else if (saddr.family == AF_INET6) {
612		opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
613		opts->addr6 = saddr.addr6;
614		if (!echo) {
615			opts->ahmac = add_addr6_generate_hmac(msk->local_key,
616							      msk->remote_key,
617							      opts->addr_id,
618							      &opts->addr6);
619		}
620	}
621#endif
622	pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo);
623
624	return true;
625}
626
627static bool mptcp_established_options_rm_addr(struct sock *sk,
628					      unsigned int *size,
629					      unsigned int remaining,
630					      struct mptcp_out_options *opts)
631{
632	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
633	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
634	u8 rm_id;
635
636	if (!mptcp_pm_should_rm_signal(msk) ||
637	    !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
638		return false;
639
640	if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
641		return false;
642
643	*size = TCPOLEN_MPTCP_RM_ADDR_BASE;
644	opts->suboptions |= OPTION_MPTCP_RM_ADDR;
645	opts->rm_id = rm_id;
646
647	pr_debug("rm_id=%d", opts->rm_id);
648
649	return true;
650}
651
652bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
653			       unsigned int *size, unsigned int remaining,
654			       struct mptcp_out_options *opts)
655{
656	unsigned int opt_size = 0;
657	bool ret = false;
658
659	opts->suboptions = 0;
660
661	if (unlikely(mptcp_check_fallback(sk)))
662		return false;
663
664	/* prevent adding of any MPTCP related options on reset packet
665	 * until we support MP_TCPRST/MP_FASTCLOSE
666	 */
667	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
668		return false;
669
670	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
671		ret = true;
672	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
673					       opts))
674		ret = true;
675
676	/* we reserved enough space for the above options, and exceeding the
677	 * TCP option space would be fatal
678	 */
679	if (WARN_ON_ONCE(opt_size > remaining))
680		return false;
681
682	*size += opt_size;
683	remaining -= opt_size;
684	if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) {
685		*size += opt_size;
686		remaining -= opt_size;
687		ret = true;
688	} else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
689		*size += opt_size;
690		remaining -= opt_size;
691		ret = true;
692	}
693
694	return ret;
695}
696
697bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
698			  struct mptcp_out_options *opts)
699{
700	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
701
702	if (subflow_req->mp_capable) {
703		opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
704		opts->sndr_key = subflow_req->local_key;
705		*size = TCPOLEN_MPTCP_MPC_SYNACK;
706		pr_debug("subflow_req=%p, local_key=%llu",
707			 subflow_req, subflow_req->local_key);
708		return true;
709	} else if (subflow_req->mp_join) {
710		opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
711		opts->backup = subflow_req->backup;
712		opts->join_id = subflow_req->local_id;
713		opts->thmac = subflow_req->thmac;
714		opts->nonce = subflow_req->local_nonce;
715		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
716			 subflow_req, opts->backup, opts->join_id,
717			 opts->thmac, opts->nonce);
718		*size = TCPOLEN_MPTCP_MPJ_SYNACK;
719		return true;
720	}
721	return false;
722}
723
/* Decide whether the MPTCP options carried by @skb can be processed on
 * this subflow, moving the subflow to the fully established state when
 * the packet allows it.
 *
 * @msk:     MPTCP socket owning the subflow
 * @ssk:     subflow socket
 * @subflow: subflow context of @ssk
 * @skb:     incoming packet
 * @mp_opt:  options already parsed from @skb
 *
 * Returns true when the subflow is, or has just become, fully established;
 * the path manager is notified the first time this happens. Returns false
 * after falling back to TCP or after resetting the subflow. For packets
 * not starting at ssn_offset + 1 on a not yet fully established subflow,
 * returns subflow->mp_capable (MP_JOIN subflows are reset instead).
 */
static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
				    struct mptcp_subflow_context *subflow,
				    struct sk_buff *skb,
				    struct mptcp_options_received *mp_opt)
{
	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
	 * will make the subflow fully established
	 */
	if (likely(subflow->fully_established)) {
		/* on passive sockets, check for 3rd ack retransmission
		 * note that msk is always set by subflow_syn_recv_sock()
		 * for mp_join subflows
		 */
		if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
		    subflow->mp_join && mp_opt->mp_join &&
		    READ_ONCE(msk->pm.server_side))
			tcp_send_ack(ssk);
		goto fully_established;
	}

	/* we must process OoO packets before the first subflow is fully
	 * established. OoO packets are instead a protocol violation
	 * for MP_JOIN subflows as the peer must not send any data
	 * before receiving the fourth ack - cfr. RFC 8684 section 3.2.
	 */
	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
		if (subflow->mp_join)
			goto reset;
		return subflow->mp_capable;
	}

	if (mp_opt->dss && mp_opt->use_ack) {
		/* subflows are fully established as soon as we get any
		 * additional ack.
		 */
		subflow->fully_established = 1;
		WRITE_ONCE(msk->fully_established, true);
		goto fully_established;
	}

	/* If the first established packet does not contain MP_CAPABLE + data
	 * then fallback to TCP. Fallback scenarios requires a reset for
	 * MP_JOIN subflows.
	 */
	if (!mp_opt->mp_capable) {
		if (subflow->mp_join)
			goto reset;
		subflow->mp_capable = 0;
		pr_fallback(msk);
		__mptcp_do_fallback(msk);
		return false;
	}

	/* an MP_CAPABLE option is only expected here on the server side */
	if (unlikely(!READ_ONCE(msk->pm.server_side)))
		pr_warn_once("bogus mpc option on established client sk");
	mptcp_subflow_fully_established(subflow, mp_opt);

fully_established:
	/* notify the path manager only once per subflow */
	if (likely(subflow->pm_notified))
		return true;

	subflow->pm_notified = 1;
	if (subflow->mp_join) {
		/* the 3rd ack retransmission timer is no longer needed */
		clear_3rdack_retransmission(ssk);
		mptcp_pm_subflow_established(msk, subflow);
	} else {
		mptcp_pm_fully_established(msk);
	}
	return true;

reset:
	mptcp_subflow_reset(ssk);
	return false;
}
799
800static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
801{
802	u32 old_ack32, cur_ack32;
803
804	if (use_64bit)
805		return cur_ack;
806
807	old_ack32 = (u32)old_ack;
808	cur_ack32 = (u32)cur_ack;
809	cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
810	if (unlikely(before(cur_ack32, old_ack32)))
811		return cur_ack + (1LL << 32);
812	return cur_ack;
813}
814
815static void update_una(struct mptcp_sock *msk,
816		       struct mptcp_options_received *mp_opt)
817{
818	u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
819	u64 write_seq = READ_ONCE(msk->write_seq);
820
821	/* avoid ack expansion on update conflict, to reduce the risk of
822	 * wrongly expanding to a future ack sequence number, which is way
823	 * more dangerous than missing an ack
824	 */
825	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
826
827	/* ACK for data not even sent yet? Ignore. */
828	if (after64(new_snd_una, write_seq))
829		new_snd_una = old_snd_una;
830
831	while (after64(new_snd_una, old_snd_una)) {
832		snd_una = old_snd_una;
833		old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
834					       new_snd_una);
835		if (old_snd_una == snd_una) {
836			mptcp_data_acked((struct sock *)msk);
837			break;
838		}
839	}
840}
841
842bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
843{
844	/* Skip if DATA_FIN was already received.
845	 * If updating simultaneously with the recvmsg loop, values
846	 * should match. If they mismatch, the peer is misbehaving and
847	 * we will prefer the most recent information.
848	 */
849	if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
850		return false;
851
852	WRITE_ONCE(msk->rcv_data_fin_seq,
853		   expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit));
854	WRITE_ONCE(msk->rcv_data_fin, 1);
855
856	return true;
857}
858
859static bool add_addr_hmac_valid(struct mptcp_sock *msk,
860				struct mptcp_options_received *mp_opt)
861{
862	u64 hmac = 0;
863
864	if (mp_opt->echo)
865		return true;
866
867	if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
868		hmac = add_addr_generate_hmac(msk->remote_key,
869					      msk->local_key,
870					      mp_opt->addr_id, &mp_opt->addr);
871#if IS_ENABLED(CONFIG_MPTCP_IPV6)
872	else
873		hmac = add_addr6_generate_hmac(msk->remote_key,
874					       msk->local_key,
875					       mp_opt->addr_id, &mp_opt->addr6);
876#endif
877
878	pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
879		 msk, (unsigned long long)hmac,
880		 (unsigned long long)mp_opt->ahmac);
881
882	return hmac == mp_opt->ahmac;
883}
884
885void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
886{
887	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
888	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
889	struct mptcp_options_received mp_opt;
890	struct mptcp_ext *mpext;
891
892	if (__mptcp_check_fallback(msk))
893		return;
894
895	mptcp_get_options(skb, &mp_opt);
896	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
897		return;
898
899	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
900		struct mptcp_addr_info addr;
901
902		addr.port = htons(mp_opt.port);
903		addr.id = mp_opt.addr_id;
904		if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
905			addr.family = AF_INET;
906			addr.addr = mp_opt.addr;
907		}
908#if IS_ENABLED(CONFIG_MPTCP_IPV6)
909		else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
910			addr.family = AF_INET6;
911			addr.addr6 = mp_opt.addr6;
912		}
913#endif
914		if (!mp_opt.echo) {
915			mptcp_pm_add_addr_received(msk, &addr);
916			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
917		} else {
918			mptcp_pm_del_add_timer(msk, &addr);
919			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
920		}
921		mp_opt.add_addr = 0;
922	}
923
924	if (mp_opt.rm_addr) {
925		mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
926		mp_opt.rm_addr = 0;
927	}
928
929	if (!mp_opt.dss)
930		return;
931
932	/* we can't wait for recvmsg() to update the ack_seq, otherwise
933	 * monodirectional flows will stuck
934	 */
935	if (mp_opt.use_ack)
936		update_una(msk, &mp_opt);
937
938	/* Zero-data-length packets are dropped by the caller and not
939	 * propagated to the MPTCP layer, so the skb extension does not
940	 * need to be allocated or populated. DATA_FIN information, if
941	 * present, needs to be updated here before the skb is freed.
942	 */
943	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
944		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
945		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
946		    schedule_work(&msk->work))
947			sock_hold(subflow->conn);
948
949		return;
950	}
951
952	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
953	if (!mpext)
954		return;
955
956	memset(mpext, 0, sizeof(*mpext));
957
958	if (mp_opt.use_map) {
959		if (mp_opt.mpc_map) {
960			/* this is an MP_CAPABLE carrying MPTCP data
961			 * we know this map the first chunk of data
962			 */
963			mptcp_crypto_key_sha(subflow->remote_key, NULL,
964					     &mpext->data_seq);
965			mpext->data_seq++;
966			mpext->subflow_seq = 1;
967			mpext->dsn64 = 1;
968			mpext->mpc_map = 1;
969			mpext->data_fin = 0;
970		} else {
971			mpext->data_seq = mp_opt.data_seq;
972			mpext->subflow_seq = mp_opt.subflow_seq;
973			mpext->dsn64 = mp_opt.dsn64;
974			mpext->data_fin = mp_opt.data_fin;
975		}
976		mpext->data_len = mp_opt.data_len;
977		mpext->use_map = 1;
978	}
979}
980
/* Serialize the MPTCP suboptions selected in @opts into the TCP option
 * space at @ptr.
 *
 * @ptr:  write position; the local cursor moves forward in 32-bit words
 *        after each field is written
 * @opts: suboptions bitmask plus the values to emit
 *
 * The suboptions are written in the order they are tested below. A DSS
 * option is written last, whenever opts->ext_copy has use_ack or use_map
 * set.
 */
void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
{
	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
		u8 len;

		/* the MP_CAPABLE length depends on the handshake step and,
		 * for the ACK, on whether a data length is attached
		 */
		if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
			len = TCPOLEN_MPTCP_MPC_SYN;
		else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
			len = TCPOLEN_MPTCP_MPC_SYNACK;
		else if (opts->ext_copy.data_len)
			len = TCPOLEN_MPTCP_MPC_ACK_DATA;
		else
			len = TCPOLEN_MPTCP_MPC_ACK;

		*ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
				      MPTCP_SUPPORTED_VERSION,
				      MPTCP_CAP_HMAC_SHA256);

		/* the plain SYN carries no key */
		if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
		    opts->suboptions))
			goto mp_capable_done;

		put_unaligned_be64(opts->sndr_key, ptr);
		ptr += 2;
		/* the SYN/ACK carries only the sender key */
		if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
			goto mp_capable_done;

		put_unaligned_be64(opts->rcvr_key, ptr);
		ptr += 2;
		if (!opts->ext_copy.data_len)
			goto mp_capable_done;

		/* 16-bit data length followed by two NOPs, filling a whole
		 * 32-bit word
		 */
		put_unaligned_be32(opts->ext_copy.data_len << 16 |
				   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
		ptr += 1;
	}

mp_capable_done:
	if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
		/* a zero ahmac selects the echo layout, without HMAC */
		if (opts->ahmac)
			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
					      TCPOLEN_MPTCP_ADD_ADDR, 0,
					      opts->addr_id);
		else
			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
					      TCPOLEN_MPTCP_ADD_ADDR_BASE,
					      MPTCP_ADDR_ECHO,
					      opts->addr_id);
		memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
		ptr += 1;
		if (opts->ahmac) {
			put_unaligned_be64(opts->ahmac, ptr);
			ptr += 2;
		}
	}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
		/* a zero ahmac selects the echo layout, without HMAC */
		if (opts->ahmac)
			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
					      TCPOLEN_MPTCP_ADD_ADDR6, 0,
					      opts->addr_id);
		else
			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
					      TCPOLEN_MPTCP_ADD_ADDR6_BASE,
					      MPTCP_ADDR_ECHO,
					      opts->addr_id);
		memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
		ptr += 4;
		if (opts->ahmac) {
			put_unaligned_be64(opts->ahmac, ptr);
			ptr += 2;
		}
	}
#endif

	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
				      TCPOLEN_MPTCP_RM_ADDR_BASE,
				      0, opts->rm_id);
	}

	if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYN,
				      opts->backup, opts->join_id);
		put_unaligned_be32(opts->token, ptr);
		ptr += 1;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYNACK,
				      opts->backup, opts->join_id);
		put_unaligned_be64(opts->thmac, ptr);
		ptr += 2;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
		/* step over the HMAC: five 32-bit words */
		ptr += 5;
	}

	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
		struct mptcp_ext *mpext = &opts->ext_copy;
		u8 len = TCPOLEN_MPTCP_DSS_BASE;
		u8 flags = 0;

		/* first pass: compute the option length and the flags */
		if (mpext->use_ack) {
			flags = MPTCP_DSS_HAS_ACK;
			if (mpext->ack64) {
				len += TCPOLEN_MPTCP_DSS_ACK64;
				flags |= MPTCP_DSS_ACK64;
			} else {
				len += TCPOLEN_MPTCP_DSS_ACK32;
			}
		}

		if (mpext->use_map) {
			len += TCPOLEN_MPTCP_DSS_MAP64;

			/* Use only 64-bit mapping flags for now, add
			 * support for optional 32-bit mappings later.
			 */
			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
			if (mpext->data_fin)
				flags |= MPTCP_DSS_DATA_FIN;
		}

		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);

		/* second pass: emit the ack, then the mapping */
		if (mpext->use_ack) {
			if (mpext->ack64) {
				put_unaligned_be64(mpext->data_ack, ptr);
				ptr += 2;
			} else {
				put_unaligned_be32(mpext->data_ack32, ptr);
				ptr += 1;
			}
		}

		if (mpext->use_map) {
			put_unaligned_be64(mpext->data_seq, ptr);
			ptr += 2;
			put_unaligned_be32(mpext->subflow_seq, ptr);
			ptr += 1;
			/* 16-bit data length followed by two NOPs; nothing is
			 * written after this word, so the cursor is not
			 * advanced
			 */
			put_unaligned_be32(mpext->data_len << 16 |
					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
		}
	}
}
1139