// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
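   The lookup below tries the most specific match first: tunnels bound
   to both saddr and daddr, then daddr only, then a local or multicast
   address, and finally keyed wildcard tunnels. In each pass, a tunnel
   on the requested link wins outright; a match on another link is only
   kept as a candidate.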
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

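/* Exact-match lookup used by the control plane (ioctl/netlink):
 * unlike ip_tunnel_lookup(), this requires saddr, daddr, link, device
 * type and key to all match the given parameters.
 */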
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

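	/* Reserve space for the tunnel header on top of the underlying
	 * device's own needs; the usable MTU is the lower device's MTU
	 * minus that overhead (and the Ethernet header, for ether-type
	 * tunnels), but never below IPV4_MIN_MTU.
	 */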
	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ?
			 dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

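	/* The packet must agree with the tunnel's TUNNEL_CSUM and
	 * TUNNEL_SEQ configuration: a checksum present on a no-csum
	 * tunnel (or missing on a csum tunnel) counts as a CRC error,
	 * and out-of-order sequence numbers are dropped as FIFO errors.
	 */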
	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ?
			       ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

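	/* If the inner packet does not fit and may not be fragmented,
	 * report the path MTU back to the sender. The *_ndo_send()
	 * variants are the ones safe to call from the transmit path.
	 */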
	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

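	/* iptunnel_xmit() builds the outer IPv4 header, hands the packet
	 * to the IP layer and consumes the skb, updating tx statistics.
	 */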
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

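	/* Try the cached route before a full lookup. The cache is only
	 * consulted for connected tunnels: once the flow key can vary
	 * per packet (NBMA destination or inherited TOS), a cached
	 * route would be wrong.
	 */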
	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

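		/* A nonzero TTL means the tunnel does path MTU
		 * discovery, so the outer header must carry DF.
		 */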
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

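/* Report the netns this tunnel transmits in (tunnel->net); it may
 * differ from the netns its net_device currently lives in.
 */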
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

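/* Queue every tunnel of this type for unregistration: devices living
 * in the dying netns (first loop) and tunnels registered in its hash
 * table whose device has been moved to another netns (second loop).
 */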
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

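/* Common initialization, called from the individual tunnel drivers'
 * ndo_init: allocate the per-cpu stats, the dst cache and the GRO
 * cells, unwinding in reverse order on failure.
 */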
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");