// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Migration disable and BH disable are needed to protect per-cpu
	 * redirect_info between BPF prog and skb_do_redirect().
	 */
	migrate_disable();
	local_bh_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
	case BPF_LWT_REROUTE:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			skb_reset_mac_header(skb);
			skb_do_redirect(skb);
			ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	local_bh_enable();
	migrate_enable();

	return ret;
}

static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
	int err = -EINVAL;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct net_device *dev = skb_dst(skb)->dev;
		struct iphdr *iph = ip_hdr(skb);

		dev_hold(dev);
		skb_dst_drop(skb);
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
		dev_put(dev);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		skb_dst_drop(skb);
		err = ipv6_stub->ipv6_route_input(skb);
	} else {
		err = -EAFNOSUPPORT;
	}

	if (err)
		goto err;
	return dst_input(skb);

err:
	kfree_skb(skb);
	return err;
}

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
		if (ret == BPF_LWT_REROUTE)
			return bpf_lwt_input_reroute(skb);
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
	int oif = l3mdev ? l3mdev->ifindex : 0;
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto err;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			goto err;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err;
		}
	}
	if (unlikely(dst->error)) {
		err = dst->error;
		dst_release(dst);
		goto err;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto err;

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return net_xmit_errno(err);

	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);
	return err;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int hh_len = dst->dev->hard_header_len;
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header changed, e.g. via bpf_lwt_push_encap,
			 * BPF_LWT_REROUTE below should have been used if the
			 * protocol was also changed.
			 */
			if (skb->protocol != proto) {
				kfree_skb(skb);
				return -EINVAL;
			}
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb, hh_len);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
			return bpf_lwt_xmit_reroute(skb);
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
					  bpf_prog_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct net *net, struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
					  extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start_noflag(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
			   int encap_len)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	gso_type |= SKB_GSO_DODGY;
	shinfo->gso_type |= gso_type;
	skb_decrease_gso_size(shinfo, encap_len);
	shinfo->gso_segs = 0;
	return 0;
}

static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
	int next_hdr_offset;
	void *next_hdr;
	__u8 protocol;

	/* SCTP and UDP_L4 gso need more nuanced handling than what
	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
	 * So at the moment only TCP GSO packets are let through.
	 */
	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -ENOTSUPP;

	if (ipv4) {
		protocol = ip_hdr(skb)->protocol;
		next_hdr_offset = sizeof(struct iphdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	} else {
		protocol = ipv6_hdr(skb)->nexthdr;
		next_hdr_offset = sizeof(struct ipv6hdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	}

	switch (protocol) {
	case IPPROTO_GRE:
		next_hdr_offset += sizeof(struct gre_base_hdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

	case IPPROTO_UDP:
		next_hdr_offset += sizeof(struct udphdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct udphdr *)next_hdr)->check)
			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

	case IPPROTO_IP:
	case IPPROTO_IPV6:
		if (ipv4)
			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
		else
			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

	default:
		return -EPROTONOSUPPORT;
	}
}

int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
	struct iphdr *iph;
	bool ipv4;
	int err;

	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
		return -EINVAL;

	/* validate protocol and length */
	iph = (struct iphdr *)hdr;
	if (iph->version == 4) {
		ipv4 = true;
		if (unlikely(len < iph->ihl * 4))
			return -EINVAL;
	} else if (iph->version == 6) {
		ipv4 = false;
		if (unlikely(len < sizeof(struct ipv6hdr)))
			return -EINVAL;
	} else {
		return -EINVAL;
	}

	if (ingress)
		err = skb_cow_head(skb, len + skb->mac_len);
	else
		err = skb_cow_head(skb,
				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
	if (unlikely(err))
		return err;

	/* push the encap headers and fix pointers */
	skb_reset_inner_headers(skb);
	skb_reset_inner_mac_header(skb); /* mac header is not yet set */
	skb_set_inner_protocol(skb, skb->protocol);
	skb->encapsulation = 1;
	skb_push(skb, len);
	if (ingress)
		skb_postpush_rcsum(skb, iph, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	bpf_compute_data_pointers(skb);
	skb_clear_hash(skb);

	if (ipv4) {
		skb->protocol = htons(ETH_P_IP);
		iph = ip_hdr(skb);

		if (!iph->check)
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
	} else {
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (skb_is_gso(skb))
		return handle_gso_encap(skb, ipv4, len);

	return 0;
}

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)