xref: /kernel/linux/linux-6.6/net/ipv4/ip_tunnel.c (revision 62306a36)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
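
/* For example, the receive path computes
 *
 *	hash = ip_tunnel_hash(key, remote);
 *	head = &itn->tunnels[hash];
 *
 * to pick one of 1 << IP_TNL_HASH_BITS buckets from key ^ remote; a
 * lookup that misses on (key, remote) can then retry the less specific
 * (key, 0) bucket, as ip_tunnel_lookup() below does.
 */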

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
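
/* So a packet carrying TUNNEL_KEY matches only a tunnel configured with
 * the same i_key, and a packet without TUNNEL_KEY matches only tunnels
 * that do not require a key at all.
 */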

/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched against a configured keyless
 * tunnel, will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for the incoming
 * packet.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
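
/* The four probe loops above implement a best-match precedence:
 *
 *   1. exact (local, remote, key) match
 *   2. (remote, key) with a wildcard local address
 *   3. (local, key) with a wildcard remote address, or local multicast
 *   4. key only, both addresses wildcard
 *
 * A tunnel on the requested link wins immediately; otherwise the first
 * up device found at the most specific level is kept as a candidate,
 * and the collect_md or fallback device is the last resort.
 */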

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
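
/* For example, a plain GRE tunnel (tunnel->hlen == 4 for the base GRE
 * header) routed over a 1500-byte Ethernet underlay works out to
 *
 *	t_hlen = 4 + sizeof(struct iphdr) = 24
 *	mtu    = 1500 - 24 = 1476
 *
 * which is the classic GRE tunnel MTU.
 */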

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags & TUNNEL_CSUM) && (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
	    ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
		if (!(tpi->flags & TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
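
/* A minimal sketch of how an encapsulation provider registers itself;
 * the fou module (net/ipv4/fou.c) follows this pattern, abbreviated
 * here:
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *		.encap_hlen	= fou_encap_hlen,
 *		.build_header	= fou_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 */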

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
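
/* For example, enabling FOU encapsulation prepends a UDP header, so
 * ip_encap_hlen() returns sizeof(struct udphdr) == 8 and the total
 * tunnel header becomes t->hlen = t->tun_hlen + 8.
 */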

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
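
/* For example, a 1500-byte DF-marked IPv4 packet entering a plain GRE
 * tunnel (tunnel_hlen == 4) whose route also has a 1500-byte MTU gives
 *
 *	mtu = 1500 - (sizeof(struct iphdr) + 4) = 1476
 *
 * so pkt_size (1500) exceeds mtu and an ICMP_FRAG_NEEDED quoting 1476
 * is sent back, letting the sender lower its path MTU.
 */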

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
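	/* A tos of 1 means "inherit": take the dsfield from the inner packet */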
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
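
/* The del/add pair above rehashes the tunnel: changing saddr, daddr or
 * i_key can move it to a different hash bucket, so it must leave the
 * table while its parameters are rewritten.
 */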

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
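
/* A minimal sketch of the userspace side of SIOCGETTUNNEL, assuming an
 * AF_INET socket fd and an existing "gre1" device (struct
 * ip_tunnel_parm and the SIOC*TUNNEL numbers come from
 * <linux/if_tunnel.h>):
 *
 *	struct ip_tunnel_parm p = {};
 *	struct ifreq ifr = {};
 *
 *	snprintf(ifr.ifr_name, IFNAMSIZ, "gre1");
 *	ifr.ifr_data = (void *)&p;
 *	if (ioctl(fd, SIOCGETTUNNEL, &ifr) == 0)
 *		printf("local %x remote %x\n",
 *		       ntohl(p.iph.saddr), ntohl(p.iph.daddr));
 */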

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
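
/* For example, an ipip device with no extra encapsulation (hlen == 0)
 * is capped at IP_MAX_MTU - sizeof(struct iphdr) = 65535 - 20 = 65515;
 * a non-strict caller asking for more is silently clamped to that.
 */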

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
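
/* A minimal sketch of how a tunnel driver wires these two helpers into
 * its pernet operations; the names mirror net/ipv4/ip_gre.c, slightly
 * abbreviated:
 *
 *	static int __net_init ipgre_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipgre_net_id,
 *					  &ipgre_link_ops, "gre0");
 *	}
 *
 *	static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
 *	{
 *		ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
 *	}
 */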

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");