// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. To be short, ttl is not a solution
   at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us; we made
   all that we could make. Even if it is your gated that injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
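
/* A concrete (and hypothetical) way to build the local dead loop described
 * above is a tunnel whose remote endpoint is routed back through the tunnel
 * itself; the addresses and the device name below are illustrative only:
 *
 *   ip tunnel add gre-loop mode gre local 192.0.2.1 remote 198.51.100.1
 *   ip link set gre-loop up
 *   ip route add 198.51.100.1/32 dev gre-loop
 *
 * Every packet towards 198.51.100.1 is now encapsulated and then routed
 * straight back into gre-loop for another round of encapsulation; the
 * per-cpu xmit_recursion counter is what finally breaks the cycle.
 */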

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				u32 id, u32 index,
				bool truncate, bool is_ipv4);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static int ipgre_err(struct sk_buff *skb, u32 info,
		     const struct tnl_ptk_info *tpi)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key to the third word
	   in the GRE header. It makes it impossible to maintain even soft
	   state for keyed GRE tunnels with enabled checksum. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
		 tpi->proto == htons(ETH_P_ERSPAN2))
		itn = net_generic(net, erspan_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return -ENOENT;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return 0;
#endif

	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return 0;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return 0;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

	return 0;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key to the third word
	 * in the GRE header. It makes it impossible to maintain even
	 * soft state for keyed GRE tunnels with enabled checksum. Tell
	 * them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by a Cisco employee;
	 * why the hell do these idiots break standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;

	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
			     iph->ihl * 4) < 0)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, IPPROTO_GRE);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
			      IPPROTO_GRE);
		return;
	}

	ipgre_err(skb, info, &tpi);
}

static bool is_erspan_type1(int gre_hdr_len)
{
	/* Both ERSPAN type I (version 0) and type II (version 1) use
	 * protocol 0x88BE, but type I has only a 4-byte GRE header,
	 * while type II has an 8-byte one.
	 */
	return gre_hdr_len == 4;
}
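
/* For reference, the GRE header length checked here follows directly from
 * the optional fields (a sketch, using gre_calc_hlen() from <net/gre.h>):
 * the base header is 4 bytes and each of checksum, key and sequence adds 4
 * more. ERSPAN type I sends a bare header, while type II carries a sequence
 * number:
 *
 *	gre_calc_hlen(0)			 4 bytes (ERSPAN type I)
 *	gre_calc_hlen(TUNNEL_SEQ)		 8 bytes (ERSPAN type II)
 *	gre_calc_hlen(TUNNEL_CSUM | TUNNEL_KEY)	12 bytes
 */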

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct erspan_base_hdr *ershdr;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	struct erspan_md2 *md2;
	int ver;
	int len;

	itn = net_generic(net, erspan_net_id);
	iph = ip_hdr(skb);
	if (is_erspan_type1(gre_hdr_len)) {
		ver = 0;
		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
					  tpi->flags | TUNNEL_NO_KEY,
					  iph->saddr, iph->daddr, 0);
	} else {
		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
		ver = ershdr->ver;
		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
					  tpi->flags | TUNNEL_KEY,
					  iph->saddr, iph->daddr, tpi->key);
	}

	if (tunnel) {
		if (is_erspan_type1(gre_hdr_len))
			len = gre_hdr_len;
		else
			len = gre_hdr_len + erspan_hdr_len(ver);

		if (unlikely(!pskb_may_pull(skb, len)))
			return PACKET_REJECT;

		if (__iptunnel_pull_header(skb,
					   len,
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct erspan_metadata *pkt_md, *md;
			struct ip_tunnel_info *info;
			unsigned char *gh;
			__be64 tun_id;
			__be16 flags;

			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			/* The skb can be uncloned in __iptunnel_pull_header,
			 * so the old pkt_md is no longer valid and we need
			 * to recompute it.
			 */
			gh = skb_network_header(skb) +
			     skb_network_header_len(skb);
			pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
							    sizeof(*ershdr));
			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			md->version = ver;
			md2 = &md->u.md2;
			memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
						       ERSPAN_V2_MDSIZE);

			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_REJECT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}
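
/* The metadata stashed above is what a later consumer reads back from the
 * skb; roughly (a sketch mirroring what erspan_fb_xmit() below does, not a
 * complete receive path):
 *
 *	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 *	struct erspan_metadata *md = ip_tunnel_info_opts(info);
 *
 *	if (info->key.tun_flags & TUNNEL_ERSPAN_OPT)
 *		pr_debug("erspan ver %d\n", md->version);
 */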

static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		const struct iphdr *tnl_params;

		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);

		tnl_params = &tunnel->parms.iph;
		if (tunnel->collect_md || tnl_params->daddr == 0) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect-metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}

static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
		     tpi.proto == htons(ETH_P_ERSPAN2))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
		goto out;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

out:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	__be16 flags = tunnel->parms.o_flags;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 flags, proto, tunnel->parms.o_key,
			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	int tunnel_hlen;
	__be16 flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	if (skb_cow_head(skb, dev->needed_headroom))
		goto err_free_skb;

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_skb;

	flags = tun_info->key.tun_flags &
		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);

	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

	return;

err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct erspan_metadata *md;
	bool truncate = false;
	__be16 proto;
	int tunnel_hlen;
	int version;
	int nhoff;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
		goto err_free_skb;
	if (tun_info->options_len < sizeof(*md))
		goto err_free_skb;
	md = ip_tunnel_info_opts(tun_info);

	/* ERSPAN has a fixed 8-byte GRE header. */
	version = md->version;
	tunnel_hlen = 8 + erspan_hdr_len(version);

	if (skb_cow_head(skb, dev->needed_headroom))
		goto err_free_skb;

	if (gre_handle_offloads(skb, false))
		goto err_free_skb;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	nhoff = skb_network_offset(skb);
	if (skb->protocol == htons(ETH_P_IP) &&
	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
		truncate = true;

	if (skb->protocol == htons(ETH_P_IPV6)) {
		int thoff;

		if (skb_transport_header_was_set(skb))
			thoff = skb_transport_offset(skb);
		else
			thoff = nhoff + sizeof(struct ipv6hdr);
		if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
			truncate = true;
	}

	if (version == 1) {
		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
				    ntohl(md->u.index), truncate, true);
		proto = htons(ETH_P_ERSPAN);
	} else if (version == 2) {
		erspan_build_header_v2(skb,
				       ntohl(tunnel_id_to_key32(key->tun_id)),
				       md->u.md2.dir,
				       get_hwid(&md->u.md2),
				       truncate, true);
		proto = htons(ETH_P_ERSPAN2);
	} else {
		goto err_free_skb;
	}

	gre_build_header(skb, 8, TUNNEL_SEQ,
			 proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));

	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

	return;

err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct ip_tunnel_key *key;
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	key = &info->key;
	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id),
			    key->tos & ~INET_ECN_MASK, 0, skb->mark,
			    skb_get_hash(skb));
	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		int pull_len = tunnel->hlen + sizeof(struct iphdr);

		if (skb_cow_head(skb, 0))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		if (!pskb_network_may_pull(skb, pull_len))
			goto free_skb;

		/* ip_tunnel_xmit() needs skb->data pointing to the GRE header. */
		skb_pull(skb, pull_len);
		skb_reset_mac_header(skb);

		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_start(skb) < skb->data)
			goto free_skb;
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	bool truncate = false;
	__be16 proto;

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		erspan_fb_xmit(skb, dev);
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, false))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	/* Push ERSPAN header */
	if (tunnel->erspan_ver == 0) {
		proto = htons(ETH_P_ERSPAN);
		tunnel->parms.o_flags &= ~TUNNEL_SEQ;
	} else if (tunnel->erspan_ver == 1) {
		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
				    tunnel->index,
				    truncate, true);
		proto = htons(ETH_P_ERSPAN);
	} else if (tunnel->erspan_ver == 2) {
		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
				       tunnel->dir, tunnel->hwid,
				       truncate, true);
		proto = htons(ETH_P_ERSPAN2);
	} else {
		goto free_skb;
	}

	tunnel->parms.o_flags &= ~TUNNEL_KEY;
	__gre_xmit(skb, dev, &tunnel->parms.iph, proto);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int len;

	len = tunnel->tun_hlen;
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	len = tunnel->tun_hlen - len;
	tunnel->hlen = tunnel->hlen + len;

	if (dev->header_ops)
		dev->hard_header_len += len;
	else
		dev->needed_headroom += len;

	if (set_mtu)
		dev->mtu = max_t(int, dev->mtu - len, 68);

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
			dev->features |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		} else {
			dev->features &= ~NETIF_F_GSO_SOFTWARE;
			dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		}
		dev->features |= NETIF_F_LLTX;
	} else {
		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
	}
}
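
/* A worked example of the bookkeeping above, assuming a tunnel created with
 * no option flags (tun_hlen == 4) that is reconfigured to use key and
 * checksum: gre_calc_hlen(TUNNEL_CSUM | TUNNEL_KEY) == 12, so len == 8;
 * hlen and the headroom grow by 8 bytes and, with set_mtu, the MTU shrinks
 * by the same 8 bytes (but never below the IPv4 minimum of 68).
 */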

static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
			    int cmd)
{
	int err;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
		    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
		    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
			return -EINVAL;
	}

	p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
	p->o_flags = gre_flags_to_tnl_flags(p->o_flags);

	err = ip_tunnel_ctl(dev, p, cmd);
	if (err)
		return err;

	if (cmd == SIOCCHGTUNNEL) {
		struct ip_tunnel *t = netdev_priv(dev);

		t->parms.i_flags = p->i_flags;
		t->parms.o_flags = p->o_flags;

		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
			ipgre_link_update(dev, true);
	}

	p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
	p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph + 1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *)skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;

		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ip_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_tunnel_ctl		= ipgre_tunnel_ctl,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor can we
		 * support two levels of outer headers requiring an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences.
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->hard_header_len = tunnel->hlen + sizeof(*iph);
			dev->needed_headroom = 0;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
		dev->hard_header_len = tunnel->hlen + sizeof(*iph);
		dev->needed_headroom = 0;
	}

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit_batch = ipgre_exit_batch_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
				 struct netlink_ext_ack *extack)
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION | GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	if (data[IFLA_GRE_ERSPAN_VER] &&
	    nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
		return 0;

	/* ERSPAN type II/III should only have the GRE sequence and key flags. */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* The ERSPAN session ID is only 10 bits. Since we reuse the
	 * 32-bit key field as the ID, check its range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}
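
/* For example (illustrative iproute2 command; addresses are placeholders),
 * a valid non-metadata ERSPAN v1 link carries exactly the GRE seq and key
 * flags and a session ID that fits in 10 bits:
 *
 *	ip link add dev erspan1 type erspan seq key 100 \
 *		local 172.16.1.100 remote 172.16.1.200 erspan_ver 1 erspan 123
 */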

static int ipgre_netlink_parms(struct net_device *dev,
			       struct nlattr *data[],
			       struct nlattr *tb[],
			       struct ip_tunnel_parm *parms,
			       __u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) &&
		    (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	return 0;
}

static int erspan_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);
	int err;

	err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
	if (err)
		return err;
	if (!data)
		return 0;

	if (data[IFLA_GRE_ERSPAN_VER]) {
		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);

		if (t->erspan_ver > 2)
			return -EINVAL;
	}

	if (t->erspan_ver == 1) {
		if (data[IFLA_GRE_ERSPAN_INDEX]) {
			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
			if (t->index & ~INDEX_MASK)
				return -EINVAL;
		}
	} else if (t->erspan_ver == 2) {
		if (data[IFLA_GRE_ERSPAN_DIR]) {
			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
				return -EINVAL;
		}
		if (data[IFLA_GRE_ERSPAN_HWID]) {
			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
				return -EINVAL;
		}
	}

	return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}
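
/* These attributes mirror iproute2's encapsulation options; e.g. GRE over
 * UDP via the fou module might be set up as follows (illustrative; the port
 * number is arbitrary and the fou receive port must match):
 *
 *	ip fou add port 5555 ipproto 47
 *	ip link add name gre1 type gre remote 192.0.2.2 \
 *		encap fou encap-sport auto encap-dport 5555
 */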

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->erspan_ver == 0)
		tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
	else
		tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */

	tunnel->parms.iph.protocol = IPPROTO_GRE;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
		       erspan_hdr_len(tunnel->erspan_ver);

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->max_mtu = 0;
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}
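
/* A gretap device in collect-metadata mode, as consumed by OVS or tc flower
 * tunnel_key actions, can be created from userspace like this (illustrative):
 *
 *	ip link add gretap1 type gretap external
 */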

static int
ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
{
	struct ip_tunnel_encap ipencap;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		int err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	return 0;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	__u32 fwmark = 0;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int erspan_newlink(struct net *src_net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	__u32 fwmark = 0;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	ipgre_link_update(dev, !tb[IFLA_MTU]);

	return 0;
}

static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
			     struct nlattr *data[],
			     struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_VER */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_DIR */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_HWID */
		nla_total_size(2) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;
	__be16 o_flags = p->o_flags;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (t->erspan_ver <= 2) {
		if (t->erspan_ver != 0 && !t->collect_md)
			t->parms.o_flags |= TUNNEL_KEY;

		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
			goto nla_put_failure;

		if (t->erspan_ver == 1) {
			if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
				goto nla_put_failure;
		} else if (t->erspan_ver == 2) {
			if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
				goto nla_put_failure;
			if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
				goto nla_put_failure;
		}
	}

	return ipgre_fill_info(skb, dev);

nla_put_failure:
	return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	ether_setup(dev);
	dev->max_mtu = 0;
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
	t->erspan_ver = 1;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= erspan_newlink,
	.changelink	= erspan_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= erspan_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
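
/* A minimal sketch of how an in-kernel caller (openvswitch is the in-tree
 * user) might consume this helper; error handling is trimmed, the device
 * name is hypothetical, and the caller must hold rtnl_lock():
 *
 *	struct net_device *dev;
 *
 *	dev = gretap_fb_dev_create(net, "gretap-fb0", NET_NAME_USER);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *	dev_change_flags(dev, dev->flags | IFF_UP, NULL);
 */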

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit_batch = ipgre_tap_exit_batch_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit_batch = erspan_exit_batch_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");