// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * vrf.c: device driver to encapsulate a VRF space
 *
 * Copyright (c) 2015 Cumulus Networks. All rights reserved.
 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
 *
 * Based on dummy, team and ipvlan drivers
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <net/rtnetlink.h>
#include <linux/u64_stats_sync.h>
#include <linux/hashtable.h>
#include <linux/spinlock_types.h>

#include <linux/inetdevice.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack.h>

#define DRV_NAME	"vrf"
#define DRV_VERSION	"1.1"

#define FIB_RULE_PREF	1000	/* default preference for FIB rules */

#define HT_MAP_BITS	4
#define HASH_INITVAL	((u32)0xcafef00d)

struct vrf_map {
	DECLARE_HASHTABLE(ht, HT_MAP_BITS);
	spinlock_t vmap_lock;

	/* shared_tables:
	 * count how many distinct tables do not comply with the strict mode
	 * requirement.
	 * shared_tables value must be 0 in order to enable the strict mode.
	 *
	 * example of the evolution of shared_tables:
	 *                                                     | time
	 * add  vrf0 --> table 100       shared_tables = 0     | t0
	 * add  vrf1 --> table 101       shared_tables = 0     | t1
	 * add  vrf2 --> table 100       shared_tables = 1     | t2
	 * add  vrf3 --> table 100       shared_tables = 1     | t3
	 * add  vrf4 --> table 101       shared_tables = 2     v t4
	 *
	 * shared_tables is a "step function" (or "staircase function")
	 * and it is increased by one when the second vrf is associated to a
	 * table.
	 *
	 * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
	 *
	 * at t3, another dev (vrf3) is bound to the same table 100 but the
	 * value of shared_tables is still 1.
	 * This means that no matter how many new vrfs will register on the
	 * table 100, the shared_tables will not increase (considering only
	 * table 100).
	 *
	 * at t4, vrf4 is bound to table 101, and shared_tables = 2.
	 *
	 * Looking at the value of shared_tables we can immediately know if
	 * the strict_mode can or cannot be enforced. Indeed, strict_mode
	 * can be enforced iff shared_tables = 0.
	 *
	 * Conversely, shared_tables is decreased when a vrf is de-associated
	 * from a table with exactly two associated vrfs.
	 */
	u32 shared_tables;

	bool strict_mode;
};

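/* One entry per FIB table that has at least one VRF bound to it: hashed by
 * table_id, with vrf_list holding the net_vrf instances registered on that
 * table and users counting them.
 */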
struct vrf_map_elem {
	struct hlist_node hnode;
	struct list_head vrf_list;  /* VRFs registered to this table */

	u32 table_id;
	int users;
	int ifindex;
};

static unsigned int vrf_net_id;

/* per netns vrf data */
struct netns_vrf {
	/* protected by rtnl lock */
	bool add_fib_rules;

	struct vrf_map vmap;
	struct ctl_table_header	*ctl_hdr;
};

struct net_vrf {
	struct rtable __rcu	*rth;
	struct rt6_info	__rcu	*rt6;
#if IS_ENABLED(CONFIG_IPV6)
	struct fib6_table	*fib6_table;
#endif
	u32			tb_id;

	struct list_head	me_list;	/* entry in vrf_map_elem */
	int			ifindex;
};

struct pcpu_dstats {
	u64			tx_pkts;
	u64			tx_bytes;
	u64			tx_drps;
	u64			rx_pkts;
	u64			rx_bytes;
	u64			rx_drps;
	struct u64_stats_sync	syncp;
};

static void vrf_rx_stats(struct net_device *dev, int len)
{
	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);

	u64_stats_update_begin(&dstats->syncp);
	dstats->rx_pkts++;
	dstats->rx_bytes += len;
	u64_stats_update_end(&dstats->syncp);
}

static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
{
	vrf_dev->stats.tx_errors++;
	kfree_skb(skb);
}

static void vrf_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_dstats *dstats;
		u64 tbytes, tpkts, tdrops, rbytes, rpkts;
		unsigned int start;

		dstats = per_cpu_ptr(dev->dstats, i);
		do {
			start = u64_stats_fetch_begin_irq(&dstats->syncp);
			tbytes = dstats->tx_bytes;
			tpkts = dstats->tx_pkts;
			tdrops = dstats->tx_drps;
			rbytes = dstats->rx_bytes;
			rpkts = dstats->rx_pkts;
		} while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
		stats->tx_bytes += tbytes;
		stats->tx_packets += tpkts;
		stats->tx_dropped += tdrops;
		stats->rx_bytes += rbytes;
		stats->rx_packets += rpkts;
	}
}

static struct vrf_map *netns_vrf_map(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	return &nn_vrf->vmap;
}

static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
{
	return netns_vrf_map(dev_net(dev));
}

static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
{
	struct list_head *me_head = &me->vrf_list;
	struct net_vrf *vrf;

	if (list_empty(me_head))
		return -ENODEV;

	vrf = list_first_entry(me_head, struct net_vrf, me_list);

	return vrf->ifindex;
}

static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
{
	struct vrf_map_elem *me;

	me = kmalloc(sizeof(*me), flags);
	if (!me)
		return NULL;

	return me;
}

static void vrf_map_elem_free(struct vrf_map_elem *me)
{
	kfree(me);
}

static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id,
			      int ifindex, int users)
{
	me->table_id = table_id;
	me->ifindex = ifindex;
	me->users = users;
	INIT_LIST_HEAD(&me->vrf_list);
}

static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap,
						u32 table_id)
{
	struct vrf_map_elem *me;
	u32 key;

	key = jhash_1word(table_id, HASH_INITVAL);
	hash_for_each_possible(vmap->ht, me, hnode, key) {
		if (me->table_id == table_id)
			return me;
	}

	return NULL;
}

static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me)
{
	u32 table_id = me->table_id;
	u32 key;

	key = jhash_1word(table_id, HASH_INITVAL);
	hash_add(vmap->ht, &me->hnode, key);
}

static void vrf_map_del_elem(struct vrf_map_elem *me)
{
	hash_del(&me->hnode);
}

static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
{
	spin_lock(&vmap->vmap_lock);
}

static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
{
	spin_unlock(&vmap->vmap_lock);
}

/* called with rtnl lock held */
static int
vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{
	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
	struct net_vrf *vrf = netdev_priv(dev);
	struct vrf_map_elem *new_me, *me;
	u32 table_id = vrf->tb_id;
	bool free_new_me = false;
	int users;
	int res;

	/* we pre-allocate elements used in the spin-locked section (so that we
	 * keep the spinlock as short as possible).
	 */
	new_me = vrf_map_elem_alloc(GFP_KERNEL);
	if (!new_me)
		return -ENOMEM;

	vrf_map_elem_init(new_me, table_id, dev->ifindex, 0);

	vrf_map_lock(vmap);

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me) {
		me = new_me;
		vrf_map_add_elem(vmap, me);
		goto link_vrf;
	}

	/* we already have an entry in the vrf_map, so it means there is (at
	 * least) a vrf registered on the specific table.
	 */
	free_new_me = true;
	if (vmap->strict_mode) {
		/* vrfs cannot share the same table */
		NL_SET_ERR_MSG(extack, "Table is used by another VRF");
		res = -EBUSY;
		goto unlock;
	}

link_vrf:
	users = ++me->users;
	if (users == 2)
		++vmap->shared_tables;

	list_add(&vrf->me_list, &me->vrf_list);

	res = 0;

unlock:
	vrf_map_unlock(vmap);

	/* clean-up, if needed */
	if (free_new_me)
		vrf_map_elem_free(new_me);

	return res;
}

/* called with rtnl lock held */
static void vrf_map_unregister_dev(struct net_device *dev)
{
	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
	struct net_vrf *vrf = netdev_priv(dev);
	u32 table_id = vrf->tb_id;
	struct vrf_map_elem *me;
	int users;

	vrf_map_lock(vmap);

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me)
		goto unlock;

	list_del(&vrf->me_list);

	users = --me->users;
	if (users == 1) {
		--vmap->shared_tables;
	} else if (users == 0) {
		vrf_map_del_elem(me);

		/* no one will refer to this element anymore */
		vrf_map_elem_free(me);
	}

unlock:
	vrf_map_unlock(vmap);
}

/* return the vrf device index associated with the table_id */
static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
{
	struct vrf_map *vmap = netns_vrf_map(net);
	struct vrf_map_elem *me;
	int ifindex;

	vrf_map_lock(vmap);

	if (!vmap->strict_mode) {
		ifindex = -EPERM;
		goto unlock;
	}

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me) {
		ifindex = -ENODEV;
		goto unlock;
	}

	ifindex = vrf_map_elem_get_vrf_ifindex(me);

unlock:
	vrf_map_unlock(vmap);

	return ifindex;
}

/* by default VRF devices do not have a qdisc and are expected
 * to be created with only a single queue.
 */
static bool qdisc_tx_is_default(const struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	if (dev->num_tx_queues > 1)
		return false;

	txq = netdev_get_tx_queue(dev, 0);
	qdisc = rcu_access_pointer(txq->qdisc);

	return !qdisc->enqueue;
}

/* Local traffic destined to local address. Reinsert the packet to rx
 * path, similar to loopback handling.
 */
static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
			  struct dst_entry *dst)
{
	int len = skb->len;

	skb_orphan(skb);

	skb_dst_set(skb, dst);

	/* set pkt_type to avoid skb hitting packet taps twice -
	 * once on Tx and again in Rx processing
	 */
	skb->pkt_type = PACKET_LOOPBACK;

	skb->protocol = eth_type_trans(skb, dev);

	if (likely(netif_rx(skb) == NET_RX_SUCCESS))
		vrf_rx_stats(dev, len);
	else
		this_cpu_inc(dev->dstats->rx_drps);

	return NETDEV_TX_OK;
}

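/* Conntrack handling for packets that take the VRF output hooks: mark the
 * skb untracked before it runs through the hooks on the VRF device (only if
 * no conntrack state is attached yet), and drop that marker again in
 * vrf_nf_reset_ct() before the packet continues, so the traversal of the
 * real output device is tracked normally rather than twice.
 */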
static void vrf_nf_set_untracked(struct sk_buff *skb)
{
	if (skb_get_nfct(skb) == 0)
		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
}

static void vrf_nf_reset_ct(struct sk_buff *skb)
{
	if (skb_get_nfct(skb) == IP_CT_UNTRACKED)
		nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
static int vrf_ip6_local_out(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	int err;

	vrf_nf_reset_ct(skb);

	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
		      sk, skb, NULL, skb_dst(skb)->dev, dst_output);

	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}

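/* IPv6 frames handed to the VRF's ndo_start_xmit: redo the route lookup with
 * the flow oif set to the VRF device so the l3mdev FIB rule selects the VRF
 * table, then either loop the packet back to the local Rx path (when the
 * resulting dst is the VRF device itself) or send it out via the new dst.
 */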
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	const struct ipv6hdr *iph;
	struct net *net = dev_net(skb->dev);
	struct flowi6 fl6;
	int ret = NET_XMIT_DROP;
	struct dst_entry *dst;
	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;

	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
		goto err;

	iph = ipv6_hdr(skb);

	memset(&fl6, 0, sizeof(fl6));
	/* needed to match OIF rule */
	fl6.flowi6_oif = dev->ifindex;
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = iph->nexthdr;
	fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;

	dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL);
	if (IS_ERR(dst) || dst == dst_null)
		goto err;

	skb_dst_drop(skb);

	/* if dst.dev is loopback or the VRF device again this is locally
	 * originated traffic destined to a local address. Short circuit
	 * to Rx path
	 */
	if (dst->dev == dev)
		return vrf_local_xmit(skb, dev, dst);

	skb_dst_set(skb, dst);

	/* strip the ethernet header added for pass through VRF device */
	__skb_pull(skb, skb_network_offset(skb));

	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
	ret = vrf_ip6_local_out(net, skb->sk, skb);
	if (unlikely(net_xmit_eval(ret)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;

	return ret;
err:
	vrf_tx_error(dev, skb);
	return NET_XMIT_DROP;
}
#else
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	vrf_tx_error(dev, skb);
	return NET_XMIT_DROP;
}
#endif

/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */
static int vrf_ip_local_out(struct net *net, struct sock *sk,
			    struct sk_buff *skb)
{
	int err;

	vrf_nf_reset_ct(skb);

	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, skb_dst(skb)->dev, dst_output);
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}

static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
					   struct net_device *vrf_dev)
{
	struct iphdr *ip4h;
	int ret = NET_XMIT_DROP;
	struct flowi4 fl4;
	struct net *net = dev_net(vrf_dev);
	struct rtable *rt;

	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
		goto err;

	ip4h = ip_hdr(skb);

	memset(&fl4, 0, sizeof(fl4));
	/* needed to match OIF rule */
	fl4.flowi4_oif = vrf_dev->ifindex;
	fl4.flowi4_iif = LOOPBACK_IFINDEX;
	fl4.flowi4_tos = RT_TOS(ip4h->tos);
	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF;
	fl4.flowi4_proto = ip4h->protocol;
	fl4.daddr = ip4h->daddr;
	fl4.saddr = ip4h->saddr;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		goto err;

	skb_dst_drop(skb);

	/* if dst.dev is loopback or the VRF device again this is locally
	 * originated traffic destined to a local address. Short circuit
	 * to Rx path
	 */
	if (rt->dst.dev == vrf_dev)
		return vrf_local_xmit(skb, vrf_dev, &rt->dst);

	skb_dst_set(skb, &rt->dst);

	/* strip the ethernet header added for pass through VRF device */
	__skb_pull(skb, skb_network_offset(skb));

	if (!ip4h->saddr) {
		ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
					       RT_SCOPE_LINK);
	}

	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
	ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(net_xmit_eval(ret)))
		vrf_dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;

out:
	return ret;
err:
	vrf_tx_error(vrf_dev, skb);
	goto out;
}

static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return vrf_process_v4_outbound(skb, dev);
	case htons(ETH_P_IPV6):
		return vrf_process_v6_outbound(skb, dev);
	default:
		vrf_tx_error(dev, skb);
		return NET_XMIT_DROP;
	}
}

static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
{
	int len = skb->len;
	netdev_tx_t ret = is_ip_tx_frame(skb, dev);

	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);

		u64_stats_update_begin(&dstats->syncp);
		dstats->tx_pkts++;
		dstats->tx_bytes += len;
		u64_stats_update_end(&dstats->syncp);
	} else {
		this_cpu_inc(dev->dstats->tx_drps);
	}

	return ret;
}

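/* Packet taps (e.g. tcpdump) attached to the VRF device expect to see an
 * ethernet header, so synthesize one (VRF MAC as source, zero destination)
 * just for dev_queue_xmit_nit() and strip it again before the skb continues.
 */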
static void vrf_finish_direct(struct sk_buff *skb)
{
	struct net_device *vrf_dev = skb->dev;

	if (!list_empty(&vrf_dev->ptype_all) &&
	    likely(skb_headroom(skb) >= ETH_HLEN)) {
		struct ethhdr *eth = skb_push(skb, ETH_HLEN);

		ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
		eth_zero_addr(eth->h_dest);
		eth->h_proto = skb->protocol;

		rcu_read_lock_bh();
		dev_queue_xmit_nit(skb, vrf_dev);
		rcu_read_unlock_bh();

		skb_pull(skb, ETH_HLEN);
	}

	vrf_nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
/* modelled after ip6_finish_output2 */
static int vrf_finish_output6(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	vrf_nf_reset_ct(skb);

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* modelled after ip6_output */
static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb_dst(skb)->dev,
			    vrf_finish_output6,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 */
static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
					    struct sk_buff *skb)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);
	struct dst_entry *dst = NULL;
	struct rt6_info *rt6;

	rcu_read_lock();

	rt6 = rcu_dereference(vrf->rt6);
	if (likely(rt6)) {
		dst = &rt6->dst;
		dst_hold(dst);
	}

	rcu_read_unlock();

	if (unlikely(!dst)) {
		vrf_tx_error(vrf_dev, skb);
		return NULL;
	}

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	return skb;
}

static int vrf_output6_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip6_local_out(net, sk, skb);
}

static int vrf_output6_direct(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	int err = 1;

	skb->protocol = htons(ETH_P_IPV6);

	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
		err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
			      NULL, skb->dev, vrf_output6_direct_finish);

	if (likely(err == 1))
		vrf_finish_direct(skb);

	return err;
}

static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int err;

	err = vrf_output6_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip6_local_out(net, sk, skb);

	return err;
}

static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
					  struct sock *sk,
					  struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_ip6_out_direct_finish);

	if (likely(err == 1))
		err = vrf_output6_direct(net, sk, skb);

	if (likely(err == 1))
		return skb;

	return NULL;
}

static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	/* don't divert link scope packets */
	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
		return skb;

	vrf_nf_set_untracked(skb);

	if (qdisc_tx_is_default(vrf_dev) ||
	    IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
		return vrf_ip6_out_direct(vrf_dev, sk, skb);

	return vrf_ip6_out_redirect(vrf_dev, skb);
}

/* holding rtnl */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
	struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
	struct net *net = dev_net(dev);
	struct dst_entry *dst;

	RCU_INIT_POINTER(vrf->rt6, NULL);
	synchronize_rcu();

	/* move dev in dst's to loopback so this VRF device can be deleted
	 * - based on dst_ifdown
	 */
	if (rt6) {
		dst = &rt6->dst;
		dev_put(dst->dev);
		dst->dev = net->loopback_dev;
		dev_hold(dst->dev);
		dst_release(dst);
	}
}

static int vrf_rt6_create(struct net_device *dev)
{
	int flags = DST_NOPOLICY | DST_NOXFRM;
	struct net_vrf *vrf = netdev_priv(dev);
	struct net *net = dev_net(dev);
	struct rt6_info *rt6;
	int rc = -ENOMEM;

	/* IPv6 can be CONFIG enabled and then disabled runtime */
	if (!ipv6_mod_enabled())
		return 0;

	vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
	if (!vrf->fib6_table)
		goto out;

	/* create a dst for routing packets out a VRF device */
	rt6 = ip6_dst_alloc(net, dev, flags);
	if (!rt6)
		goto out;

	rt6->dst.output = vrf_output6;

	rcu_assign_pointer(vrf->rt6, rt6);

	rc = 0;
out:
	return rc;
}
#else
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	return skb;
}

static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
}

static int vrf_rt6_create(struct net_device *dev)
{
	return 0;
}
#endif

/* modelled after ip_finish_output2 */
static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;
	int ret = -EINVAL;

	vrf_nf_reset_ct(skb);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (!skb2) {
			ret = -ENOMEM;
			goto err;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();

	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		/* if crossing protocols, can not use the cached header */
		ret = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock_bh();
		return ret;
	}

	rcu_read_unlock_bh();
err:
	vrf_tx_error(skb->dev, skb);
	return ret;
}

static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    vrf_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 */
static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
					   struct sk_buff *skb)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);
	struct dst_entry *dst = NULL;
	struct rtable *rth;

	rcu_read_lock();

	rth = rcu_dereference(vrf->rth);
	if (likely(rth)) {
		dst = &rth->dst;
		dst_hold(dst);
	}

	rcu_read_unlock();

	if (unlikely(!dst)) {
		vrf_tx_error(vrf_dev, skb);
		return NULL;
	}

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	return skb;
}

static int vrf_output_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip_local_out(net, sk, skb);
}

static int vrf_output_direct(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	int err = 1;

	skb->protocol = htons(ETH_P_IP);

	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
		err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
			      NULL, skb->dev, vrf_output_direct_finish);

	if (likely(err == 1))
		vrf_finish_direct(skb);

	return err;
}

static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	int err;

	err = vrf_output_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip_local_out(net, sk, skb);

	return err;
}

static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
					 struct sock *sk,
					 struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_ip_out_direct_finish);

	if (likely(err == 1))
		err = vrf_output_direct(net, sk, skb);

	if (likely(err == 1))
		return skb;

	return NULL;
}

static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
				  struct sock *sk,
				  struct sk_buff *skb)
{
	/* don't divert multicast or local broadcast */
	if (ipv4_is_multicast(ip_hdr(skb)->daddr) ||
	    ipv4_is_lbcast(ip_hdr(skb)->daddr))
		return skb;

	vrf_nf_set_untracked(skb);

	if (qdisc_tx_is_default(vrf_dev) ||
	    IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
		return vrf_ip_out_direct(vrf_dev, sk, skb);

	return vrf_ip_out_redirect(vrf_dev, skb);
}

/* called with rcu lock held */
static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
				  struct sock *sk,
				  struct sk_buff *skb,
				  u16 proto)
{
	switch (proto) {
	case AF_INET:
		return vrf_ip_out(vrf_dev, sk, skb);
	case AF_INET6:
		return vrf_ip6_out(vrf_dev, sk, skb);
	}

	return skb;
}

/* holding rtnl */
static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
{
	struct rtable *rth = rtnl_dereference(vrf->rth);
	struct net *net = dev_net(dev);
	struct dst_entry *dst;

	RCU_INIT_POINTER(vrf->rth, NULL);
	synchronize_rcu();

	/* move dev in dst's to loopback so this VRF device can be deleted
	 * - based on dst_ifdown
	 */
	if (rth) {
		dst = &rth->dst;
		dev_put(dst->dev);
		dst->dev = net->loopback_dev;
		dev_hold(dst->dev);
		dst_release(dst);
	}
}

static int vrf_rtable_create(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);
	struct rtable *rth;

	if (!fib_new_table(dev_net(dev), vrf->tb_id))
		return -ENOMEM;

	/* create a dst for routing packets out through a VRF device */
	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1);
	if (!rth)
		return -ENOMEM;

	rth->dst.output	= vrf_output;

	rcu_assign_pointer(vrf->rth, rth);

	return 0;
}

/**************************** device handling ********************/

/* cycle interface to flush neighbor cache and move routes across tables */
static void cycle_netdev(struct net_device *dev,
			 struct netlink_ext_ack *extack)
{
	unsigned int flags = dev->flags;
	int ret;

	if (!netif_running(dev))
		return;

	ret = dev_change_flags(dev, flags & ~IFF_UP, extack);
	if (ret >= 0)
		ret = dev_change_flags(dev, flags, extack);

	if (ret < 0) {
		netdev_err(dev,
			   "Failed to cycle device %s; route tables might be wrong!\n",
			   dev->name);
	}
}

static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
			    struct netlink_ext_ack *extack)
{
	int ret;

	/* do not allow loopback device to be enslaved to a VRF.
	 * The vrf device acts as the loopback for the vrf.
	 */
	if (port_dev == dev_net(dev)->loopback_dev) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave loopback device to a VRF");
		return -EOPNOTSUPP;
	}

	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
	if (ret < 0)
		goto err;

	cycle_netdev(port_dev, extack);

	return 0;

err:
	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
	return ret;
}

static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
			 struct netlink_ext_ack *extack)
{
	if (netif_is_l3_master(port_dev)) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave an L3 master device to a VRF");
		return -EINVAL;
	}

	if (netif_is_l3_slave(port_dev))
		return -EINVAL;

	return do_vrf_add_slave(dev, port_dev, extack);
}

/* inverse of do_vrf_add_slave */
static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	netdev_upper_dev_unlink(port_dev, dev);
	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;

	cycle_netdev(port_dev, NULL);

	return 0;
}

static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	return do_vrf_del_slave(dev, port_dev);
}

static void vrf_dev_uninit(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	vrf_rtable_release(dev, vrf);
	vrf_rt6_release(dev, vrf);

	free_percpu(dev->dstats);
	dev->dstats = NULL;
}

static int vrf_dev_init(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
	if (!dev->dstats)
		goto out_nomem;

	/* create the default dst which points back to us */
	if (vrf_rtable_create(dev) != 0)
		goto out_stats;

	if (vrf_rt6_create(dev) != 0)
		goto out_rth;

	dev->flags = IFF_MASTER | IFF_NOARP;

	/* similarly, oper state is irrelevant; set to up to avoid confusion */
	dev->operstate = IF_OPER_UP;
	netdev_lockdep_set_classes(dev);
	return 0;

out_rth:
	vrf_rtable_release(dev, vrf);
out_stats:
	free_percpu(dev->dstats);
	dev->dstats = NULL;
out_nomem:
	return -ENOMEM;
}

static const struct net_device_ops vrf_netdev_ops = {
	.ndo_init		= vrf_dev_init,
	.ndo_uninit		= vrf_dev_uninit,
	.ndo_start_xmit		= vrf_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_get_stats64	= vrf_get_stats64,
	.ndo_add_slave		= vrf_add_slave,
	.ndo_del_slave		= vrf_del_slave,
};

static u32 vrf_fib_table(const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return vrf->tb_id;
}

static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
				      struct sk_buff *skb,
				      struct net_device *dev)
{
	struct net *net = dev_net(dev);

	if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
		skb = NULL;	/* kfree_skb(skb) handled by nf code */

	return skb;
}

#if IS_ENABLED(CONFIG_IPV6)
/* neighbor handling is done with actual device; do not want
 * to flip skb->dev for those ndisc packets. This really fails
 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
 * a start.
 */
static bool ipv6_ndisc_frame(const struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	bool rc = false;

	if (iph->nexthdr == NEXTHDR_ICMP) {
		const struct icmp6hdr *icmph;
		struct icmp6hdr _icmph;

		icmph = skb_header_pointer(skb, sizeof(*iph),
					   sizeof(_icmph), &_icmph);
		if (!icmph)
			goto out;

		switch (icmph->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			rc = true;
			break;
		}
	}

out:
	return rc;
}

static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
					     const struct net_device *dev,
					     struct flowi6 *fl6,
					     int ifindex,
					     const struct sk_buff *skb,
					     int flags)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
}

static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
			      int ifindex)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct flowi6 fl6 = {
		.flowi6_iif     = ifindex,
		.flowi6_mark    = skb->mark,
		.flowi6_proto   = iph->nexthdr,
		.daddr          = iph->daddr,
		.saddr          = iph->saddr,
		.flowlabel      = ip6_flowinfo(iph),
	};
	struct net *net = dev_net(vrf_dev);
	struct rt6_info *rt6;

	rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
				   RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE);
	if (unlikely(!rt6))
		return;

	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
		return;

	skb_dst_set(skb, &rt6->dst);
}

static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	int orig_iif = skb->skb_iif;
	bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
	bool is_ndisc = ipv6_ndisc_frame(skb);

	/* loopback, multicast & non-ND link-local traffic; do not push through
	 * packet taps again. Reset pkt_type for upper layers to process skb.
	 * For strict packets with a source LLA, determine the dst using the
	 * original ifindex.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;
		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;

		if (skb->pkt_type == PACKET_LOOPBACK)
			skb->pkt_type = PACKET_HOST;
		else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)
			vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

		goto out;
	}

	/* if packet is NDISC then keep the ingress interface */
	if (!is_ndisc) {
		vrf_rx_stats(vrf_dev, skb->len);
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;

		if (!list_empty(&vrf_dev->ptype_all)) {
			skb_push(skb, skb->mac_len);
			dev_queue_xmit_nit(skb, vrf_dev);
			skb_pull(skb, skb->mac_len);
		}

		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
	}

	if (need_strict)
		vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

	skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

#else
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	return skb;
}
#endif

static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb)
{
	skb->dev = vrf_dev;
	skb->skb_iif = vrf_dev->ifindex;
	IPCB(skb)->flags |= IPSKB_L3SLAVE;

	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
		goto out;

	/* loopback traffic; do not push through packet taps again.
	 * Reset pkt_type for upper layers to process skb
	 */
	if (skb->pkt_type == PACKET_LOOPBACK) {
		skb->pkt_type = PACKET_HOST;
		goto out;
	}

	vrf_rx_stats(vrf_dev, skb->len);

	if (!list_empty(&vrf_dev->ptype_all)) {
		skb_push(skb, skb->mac_len);
		dev_queue_xmit_nit(skb, vrf_dev);
		skb_pull(skb, skb->mac_len);
	}

	skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

/* called with rcu lock held */
static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb,
				  u16 proto)
{
	switch (proto) {
	case AF_INET:
		return vrf_ip_rcv(vrf_dev, skb);
	case AF_INET6:
		return vrf_ip6_rcv(vrf_dev, skb);
	}

	return skb;
}

#if IS_ENABLED(CONFIG_IPV6)
/* send to link-local or multicast address via interface enslaved to
 * VRF device. Force lookup to VRF table without changing flow struct
 * Note: Caller to this function must hold rcu_read_lock() and no refcnt
 * is taken on the dst by this function.
 */
static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
					       struct flowi6 *fl6)
{
	struct net *net = dev_net(dev);
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
	struct dst_entry *dst = NULL;
	struct rt6_info *rt;

	/* VRF device does not have a link-local address and
	 * sending packets to link-local or mcast addresses over
	 * a VRF device does not make sense
	 */
	if (fl6->flowi6_oif == dev->ifindex) {
		dst = &net->ipv6.ip6_null_entry->dst;
		return dst;
	}

	if (!ipv6_addr_any(&fl6->saddr))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
	if (rt)
		dst = &rt->dst;

	return dst;
}
#endif

static const struct l3mdev_ops vrf_l3mdev_ops = {
	.l3mdev_fib_table	= vrf_fib_table,
	.l3mdev_l3_rcv		= vrf_l3_rcv,
	.l3mdev_l3_out		= vrf_l3_out,
#if IS_ENABLED(CONFIG_IPV6)
	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
#endif
};

static void vrf_get_drvinfo(struct net_device *dev,
			    struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static const struct ethtool_ops vrf_ethtool_ops = {
	.get_drvinfo	= vrf_get_drvinfo,
};

static inline size_t vrf_fib_rule_nl_size(void)
{
	size_t sz;

	sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
	sz += nla_total_size(sizeof(u8));	/* FRA_L3MDEV */
	sz += nla_total_size(sizeof(u32));	/* FRA_PRIORITY */
	sz += nla_total_size(sizeof(u8));	/* FRA_PROTOCOL */

	return sz;
}

static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
{
	struct fib_rule_hdr *frh;
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	int err;

	if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
	    !ipv6_mod_enabled())
		return 0;

	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
	if (!nlh)
		goto nla_put_failure;

	/* rule only needs to appear once */
	nlh->nlmsg_flags |= NLM_F_EXCL;

	frh = nlmsg_data(nlh);
	memset(frh, 0, sizeof(*frh));
	frh->family = family;
	frh->action = FR_ACT_TO_TBL;

	if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL))
		goto nla_put_failure;

	if (nla_put_u8(skb, FRA_L3MDEV, 1))
		goto nla_put_failure;

	if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	/* fib_nl_{new,del}rule handling looks for net from skb->sk */
	skb->sk = dev_net(dev)->rtnl;
	if (add_it) {
		err = fib_nl_newrule(skb, nlh, NULL);
		if (err == -EEXIST)
			err = 0;
	} else {
		err = fib_nl_delrule(skb, nlh, NULL);
		if (err == -ENOENT)
			err = 0;
	}
	nlmsg_free(skb);

	return err;

nla_put_failure:
	nlmsg_free(skb);

	return -EMSGSIZE;
}

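/* Install the default l3mdev FIB rules for IPv4, IPv6 and, when multicast
 * policy routing is enabled, the IPMR/IP6MR families.  This runs once per
 * network namespace, when the first VRF is created (see add_fib_rules in
 * vrf_newlink).  The result is roughly what iproute2 would create with
 * "ip rule add l3mdev pref 1000" for each family.
 */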
static int vrf_add_fib_rules(const struct net_device *dev)
{
	int err;

	err = vrf_fib_rule(dev, AF_INET, true);
	if (err < 0)
		goto out_err;

	err = vrf_fib_rule(dev, AF_INET6, true);
	if (err < 0)
		goto ipv6_err;

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
	err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
	if (err < 0)
		goto ipmr_err;
#endif

#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
	err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
	if (err < 0)
		goto ip6mr_err;
#endif

	return 0;

#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
ip6mr_err:
	vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false);
#endif

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
ipmr_err:
	vrf_fib_rule(dev, AF_INET6, false);
#endif

ipv6_err:
	vrf_fib_rule(dev, AF_INET, false);

out_err:
	netdev_err(dev, "Failed to add FIB rules.\n");
	return err;
}

static void vrf_setup(struct net_device *dev)
{
	ether_setup(dev);

	/* Initialize the device structure. */
	dev->netdev_ops = &vrf_netdev_ops;
	dev->l3mdev_ops = &vrf_l3mdev_ops;
	dev->ethtool_ops = &vrf_ethtool_ops;
	dev->needs_free_netdev = true;

	/* Fill in device structure with ethernet-generic values. */
	eth_hw_addr_random(dev);

	/* don't acquire vrf device's netif_tx_lock when transmitting */
	dev->features |= NETIF_F_LLTX;

	/* don't allow vrf devices to change network namespaces. */
	dev->features |= NETIF_F_NETNS_LOCAL;

	/* does not make sense for a VLAN to be added to a vrf device */
	dev->features |= NETIF_F_VLAN_CHALLENGED;

	/* enable offload features */
	dev->features |= NETIF_F_GSO_SOFTWARE;
	dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
	dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;

	dev->hw_features = dev->features;
	dev->hw_enc_features = dev->features;

	/* default to no qdisc; user can add if desired */
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_NO_RX_HANDLER;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	/* VRF devices do not care about MTU, but if the MTU is set
	 * too low then the ipv4 and ipv6 protocols are disabled
	 * which breaks networking.
	 */
	dev->min_mtu = IPV6_MIN_MTU;
	dev->max_mtu = IP6_MAX_MTU;
	dev->mtu = dev->max_mtu;
}

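/* Typical iproute2 usage, for reference:
 *
 *   ip link add vrf-blue type vrf table 10   (IFLA_VRF_TABLE = 10)
 *   ip link set dev vrf-blue up
 *   ip link set dev eth1 master vrf-blue     (enslave a port, vrf_add_slave)
 */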
static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EINVAL;
		}
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EADDRNOTAVAIL;
		}
	}
	return 0;
}

static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
	struct net_device *port_dev;
	struct list_head *iter;

	netdev_for_each_lower_dev(dev, port_dev, iter)
		vrf_del_slave(dev, port_dev);

	vrf_map_unregister_dev(dev);

	unregister_netdevice_queue(dev, head);
}

static int vrf_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	struct net_vrf *vrf = netdev_priv(dev);
	struct netns_vrf *nn_vrf;
	bool *add_fib_rules;
	struct net *net;
	int err;

	if (!data || !data[IFLA_VRF_TABLE]) {
		NL_SET_ERR_MSG(extack, "VRF table id is missing");
		return -EINVAL;
	}

	vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
	if (vrf->tb_id == RT_TABLE_UNSPEC) {
		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
				    "Invalid VRF table id");
		return -EINVAL;
	}

	dev->priv_flags |= IFF_L3MDEV_MASTER;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* mapping between table_id and vrf;
	 * note: such binding could not be done in the dev init function
	 * because dev->ifindex is not available yet.
	 */
	vrf->ifindex = dev->ifindex;

	err = vrf_map_register_dev(dev, extack);
	if (err) {
		unregister_netdevice(dev);
		goto out;
	}

	net = dev_net(dev);
	nn_vrf = net_generic(net, vrf_net_id);

	add_fib_rules = &nn_vrf->add_fib_rules;
	if (*add_fib_rules) {
		err = vrf_add_fib_rules(dev);
		if (err) {
			vrf_map_unregister_dev(dev);
			unregister_netdevice(dev);
			goto out;
		}
		*add_fib_rules = false;
	}

out:
	return err;
}

static size_t vrf_nl_getsize(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
}

static int vrf_fillinfo(struct sk_buff *skb,
			const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
}

static size_t vrf_get_slave_size(const struct net_device *bond_dev,
				 const struct net_device *slave_dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
}

static int vrf_fill_slave_info(struct sk_buff *skb,
			       const struct net_device *vrf_dev,
			       const struct net_device *slave_dev)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);

	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
		return -EMSGSIZE;

	return 0;
}

static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
};

static struct rtnl_link_ops vrf_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct net_vrf),

	.get_size	= vrf_nl_getsize,
	.policy		= vrf_nl_policy,
	.validate	= vrf_validate,
	.fill_info	= vrf_fillinfo,

	.get_slave_size  = vrf_get_slave_size,
	.fill_slave_info = vrf_fill_slave_info,

	.newlink	= vrf_newlink,
	.dellink	= vrf_dellink,
	.setup		= vrf_setup,
	.maxtype	= IFLA_VRF_MAX,
};

static int vrf_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	/* only care about unregister events to drop slave references */
	if (event == NETDEV_UNREGISTER) {
		struct net_device *vrf_dev;

		if (!netif_is_l3_slave(dev))
			goto out;

		vrf_dev = netdev_master_upper_dev_get(dev);
		vrf_del_slave(vrf_dev, dev);
	}
out:
	return NOTIFY_DONE;
}

static struct notifier_block vrf_notifier_block __read_mostly = {
	.notifier_call = vrf_device_event,
};

static int vrf_map_init(struct vrf_map *vmap)
{
	spin_lock_init(&vmap->vmap_lock);
	hash_init(vmap->ht);

	vmap->strict_mode = false;

	return 0;
}

#ifdef CONFIG_SYSCTL
static bool vrf_strict_mode(struct vrf_map *vmap)
{
	bool strict_mode;

	vrf_map_lock(vmap);
	strict_mode = vmap->strict_mode;
	vrf_map_unlock(vmap);

	return strict_mode;
}

static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
	bool *cur_mode;
	int res = 0;

	vrf_map_lock(vmap);

	cur_mode = &vmap->strict_mode;
	if (*cur_mode == new_mode)
		goto unlock;

	if (*cur_mode) {
		/* disable strict mode */
		*cur_mode = false;
	} else {
		if (vmap->shared_tables) {
			/* we cannot allow strict_mode because there are some
			 * vrfs that share one or more tables.
			 */
			res = -EBUSY;
			goto unlock;
		}

		/* no tables are shared among vrfs, so we can go back
		 * to a 1:1 association between a vrf and its table.
		 */
		*cur_mode = true;
	}

unlock:
	vrf_map_unlock(vmap);

	return res;
}

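/* Handler for the per-netns "strict_mode" sysctl registered under net/vrf.
 * Writing 1 (e.g. "sysctl -w net.vrf.strict_mode=1") succeeds only while
 * shared_tables is zero, i.e. no table is currently used by more than one
 * VRF; once enabled, vrf_map_register_dev() rejects a second VRF on the same
 * table with -EBUSY.
 */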
static int vrf_shared_table_handler(struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)table->extra1;
	struct vrf_map *vmap = netns_vrf_map(net);
	int proc_strict_mode = 0;
	struct ctl_table tmp = {
		.procname	= table->procname,
		.data		= &proc_strict_mode,
		.maxlen		= sizeof(int),
		.mode		= table->mode,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};
	int ret;

	if (!write)
		proc_strict_mode = vrf_strict_mode(vmap);

	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0)
		ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);

	return ret;
}

static const struct ctl_table vrf_table[] = {
	{
		.procname	= "strict_mode",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= vrf_shared_table_handler,
		/* set by the vrf_netns_init */
		.extra1		= NULL,
	},
	{ },
};

static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	struct ctl_table *table;

	table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* init the extra1 parameter with the reference to current netns */
	table[0].extra1 = net;

	nn_vrf->ctl_hdr = register_net_sysctl(net, "net/vrf", table);
	if (!nn_vrf->ctl_hdr) {
		kfree(table);
		return -ENOMEM;
	}

	return 0;
}

static void vrf_netns_exit_sysctl(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
	struct ctl_table *table;

	table = nn_vrf->ctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(nn_vrf->ctl_hdr);
	kfree(table);
}
#else
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}

static void vrf_netns_exit_sysctl(struct net *net)
{
}
#endif

/* Initialize per network namespace state */
static int __net_init vrf_netns_init(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	nn_vrf->add_fib_rules = true;
	vrf_map_init(&nn_vrf->vmap);

	return vrf_netns_init_sysctl(net, nn_vrf);
}

static void __net_exit vrf_netns_exit(struct net *net)
{
	vrf_netns_exit_sysctl(net);
}

static struct pernet_operations vrf_net_ops __net_initdata = {
	.init = vrf_netns_init,
	.exit = vrf_netns_exit,
	.id   = &vrf_net_id,
	.size = sizeof(struct netns_vrf),
};

static int __init vrf_init_module(void)
{
	int rc;

	register_netdevice_notifier(&vrf_notifier_block);

	rc = register_pernet_subsys(&vrf_net_ops);
	if (rc < 0)
		goto error;

	rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
					  vrf_ifindex_lookup_by_table_id);
	if (rc < 0)
		goto unreg_pernet;

	rc = rtnl_link_register(&vrf_link_ops);
	if (rc < 0)
		goto table_lookup_unreg;

	return 0;

table_lookup_unreg:
	l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
				       vrf_ifindex_lookup_by_table_id);

unreg_pernet:
	unregister_pernet_subsys(&vrf_net_ops);

error:
	unregister_netdevice_notifier(&vrf_notifier_block);
	return rc;
}

module_init(vrf_init_module);
MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
MODULE_VERSION(DRV_VERSION);