xref: /kernel/linux/linux-5.10/net/ipv4/ip_tunnel.c (revision 8c2ecf20)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
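
/* Matching semantics of ip_tunnel_key_match(), spelled out (a summary of
 * the code above, not an addition to it):
 *
 *   tunnel parms      packet flags   result
 *   TUNNEL_KEY set    TUNNEL_KEY set true iff key == p->i_key
 *   TUNNEL_KEY set    no key         false (key expected, none present)
 *   no TUNNEL_KEY     TUNNEL_KEY set false (key present, none expected)
 *   no TUNNEL_KEY     no key         true
 */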

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
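
/* Illustrative only: roughly how a tunnel receive handler drives the
 * lookup above. The ipip driver, for example, resolves its per-netns
 * tunnel table and looks up a keyless tunnel by the outer addresses
 * (a sketch, not part of this file; "ipip_net_id" is that driver's
 * pernet id):
 *
 *	struct net *net = dev_net(skb->dev);
 *	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *tunnel;
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 *				  iph->saddr, iph->daddr, 0);
 *	if (tunnel)
 *		... hand skb on to ip_tunnel_rcv() ...
 */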

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
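
/* Worked example of the arithmetic above (assuming a GRE tunnel with a
 * 4-byte key over a 1500-byte Ethernet underlay): tunnel->hlen = 8
 * (4-byte GRE base header + 4-byte key), so t_hlen = 8 + 20 = 28 and
 * the suggested mtu is 1500 - 28 = 1472, the familiar keyed-GRE MTU.
 */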

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
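
/* Illustrative only: how a driver's newlink/changelink path might feed
 * netlink-parsed encap parameters into ip_tunnel_encap_setup(). The FOU
 * type and checksum flag come from uapi/linux/if_tunnel.h; the
 * destination port is an arbitrary example value (sketch, not part of
 * this file):
 *
 *	struct ip_tunnel *t = netdev_priv(dev);
 *	struct ip_tunnel_encap ipencap = {
 *		.type	= TUNNEL_ENCAP_FOU,
 *		.sport	= 0,			// 0: kernel picks the port
 *		.dport	= htons(5555),
 *		.flags	= TUNNEL_ENCAP_FLAG_CSUM,
 *	};
 *	int err = ip_tunnel_encap_setup(t, &ipencap);
 */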

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
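
/* Worked example: with DF set and a 1500-byte route MTU, a plain ipip
 * tunnel (tunnel->hlen = 0, so only the 20-byte outer iphdr is
 * subtracted) gets mtu = 1500 - 20 = 1480; any DF-marked inner IPv4
 * packet larger than that takes the ICMP_FRAG_NEEDED path above and
 * returns -E2BIG.
 */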

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
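
/* Illustrative only: a minimal ndo_start_xmit built on ip_tunnel_xmit(),
 * in the style of the ipip driver (a sketch; flag checks and error
 * handling omitted):
 *
 *	static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 *					    struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
 *		return NETDEV_TX_OK;
 *	}
 */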

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
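
/* Illustrative only: the userspace side of the ioctl above. A tool such
 * as iproute2 fetches tunnel parameters by pointing ifr_ifru.ifru_data
 * at a struct ip_tunnel_parm and issuing the ioctl on any AF_INET
 * socket (a sketch; error handling abbreviated, "tunl0" is an example
 * device name):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <linux/if_tunnel.h>
 *
 *	struct ip_tunnel_parm p;
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&p, 0, sizeof(p));
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	if (ioctl(fd, SIOCGETTUNNEL, &ifr) < 0)
 *		perror("SIOCGETTUNNEL");
 */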

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
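
/* Worked example: for a plain ipip tunnel (tunnel->hlen = 0), max_mtu is
 * IP_MAX_MTU - 20 = 65515; for an ARPHRD_ETHER device such as gretap,
 * the Ethernet hard_header_len is subtracted as well.
 */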

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: there is one, and only one, per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
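
/* Illustrative only: a driver's rtnl newlink op is typically a thin
 * wrapper that parses netlink attributes into an ip_tunnel_parm and
 * delegates here (a sketch after the ipip driver; attribute parsing
 * elided):
 *
 *	static int ipip_newlink(struct net *src_net, struct net_device *dev,
 *				struct nlattr *tb[], struct nlattr *data[],
 *				struct netlink_ext_ack *extack)
 *	{
 *		struct ip_tunnel_parm p;
 *		__u32 fwmark = 0;
 *
 *		... parse data[] into &p and fwmark ...
 *		return ip_tunnel_newlink(dev, tb, &p, fwmark);
 *	}
 */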

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");