162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> 362306a36Sopenharmony_ci */ 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci#include <linux/filter.h> 662306a36Sopenharmony_ci#include <linux/kernel.h> 762306a36Sopenharmony_ci#include <linux/module.h> 862306a36Sopenharmony_ci#include <linux/skbuff.h> 962306a36Sopenharmony_ci#include <linux/types.h> 1062306a36Sopenharmony_ci#include <linux/bpf.h> 1162306a36Sopenharmony_ci#include <net/lwtunnel.h> 1262306a36Sopenharmony_ci#include <net/gre.h> 1362306a36Sopenharmony_ci#include <net/ip6_route.h> 1462306a36Sopenharmony_ci#include <net/ipv6_stubs.h> 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_cistruct bpf_lwt_prog { 1762306a36Sopenharmony_ci struct bpf_prog *prog; 1862306a36Sopenharmony_ci char *name; 1962306a36Sopenharmony_ci}; 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_cistruct bpf_lwt { 2262306a36Sopenharmony_ci struct bpf_lwt_prog in; 2362306a36Sopenharmony_ci struct bpf_lwt_prog out; 2462306a36Sopenharmony_ci struct bpf_lwt_prog xmit; 2562306a36Sopenharmony_ci int family; 2662306a36Sopenharmony_ci}; 2762306a36Sopenharmony_ci 2862306a36Sopenharmony_ci#define MAX_PROG_NAME 256 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_cistatic inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) 3162306a36Sopenharmony_ci{ 3262306a36Sopenharmony_ci return (struct bpf_lwt *)lwt->data; 3362306a36Sopenharmony_ci} 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci#define NO_REDIRECT false 3662306a36Sopenharmony_ci#define CAN_REDIRECT true 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_cistatic int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, 3962306a36Sopenharmony_ci struct dst_entry *dst, bool can_redirect) 4062306a36Sopenharmony_ci{ 4162306a36Sopenharmony_ci int ret; 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci /* Migration disable and BH disable are needed to protect per-cpu 4462306a36Sopenharmony_ci * redirect_info between BPF prog and skb_do_redirect(). 4562306a36Sopenharmony_ci */ 4662306a36Sopenharmony_ci migrate_disable(); 4762306a36Sopenharmony_ci local_bh_disable(); 4862306a36Sopenharmony_ci bpf_compute_data_pointers(skb); 4962306a36Sopenharmony_ci ret = bpf_prog_run_save_cb(lwt->prog, skb); 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci switch (ret) { 5262306a36Sopenharmony_ci case BPF_OK: 5362306a36Sopenharmony_ci case BPF_LWT_REROUTE: 5462306a36Sopenharmony_ci break; 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci case BPF_REDIRECT: 5762306a36Sopenharmony_ci if (unlikely(!can_redirect)) { 5862306a36Sopenharmony_ci pr_warn_once("Illegal redirect return code in prog %s\n", 5962306a36Sopenharmony_ci lwt->name ? : "<unknown>"); 6062306a36Sopenharmony_ci ret = BPF_OK; 6162306a36Sopenharmony_ci } else { 6262306a36Sopenharmony_ci skb_reset_mac_header(skb); 6362306a36Sopenharmony_ci skb_do_redirect(skb); 6462306a36Sopenharmony_ci ret = BPF_REDIRECT; 6562306a36Sopenharmony_ci } 6662306a36Sopenharmony_ci break; 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci case BPF_DROP: 6962306a36Sopenharmony_ci kfree_skb(skb); 7062306a36Sopenharmony_ci ret = -EPERM; 7162306a36Sopenharmony_ci break; 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_ci default: 7462306a36Sopenharmony_ci pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); 7562306a36Sopenharmony_ci kfree_skb(skb); 7662306a36Sopenharmony_ci ret = -EINVAL; 7762306a36Sopenharmony_ci break; 7862306a36Sopenharmony_ci } 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci local_bh_enable(); 8162306a36Sopenharmony_ci migrate_enable(); 8262306a36Sopenharmony_ci 8362306a36Sopenharmony_ci return ret; 8462306a36Sopenharmony_ci} 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_cistatic int bpf_lwt_input_reroute(struct sk_buff *skb) 8762306a36Sopenharmony_ci{ 8862306a36Sopenharmony_ci int err = -EINVAL; 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci if (skb->protocol == htons(ETH_P_IP)) { 9162306a36Sopenharmony_ci struct net_device *dev = skb_dst(skb)->dev; 9262306a36Sopenharmony_ci struct iphdr *iph = ip_hdr(skb); 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci dev_hold(dev); 9562306a36Sopenharmony_ci skb_dst_drop(skb); 9662306a36Sopenharmony_ci err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 9762306a36Sopenharmony_ci iph->tos, dev); 9862306a36Sopenharmony_ci dev_put(dev); 9962306a36Sopenharmony_ci } else if (skb->protocol == htons(ETH_P_IPV6)) { 10062306a36Sopenharmony_ci skb_dst_drop(skb); 10162306a36Sopenharmony_ci err = ipv6_stub->ipv6_route_input(skb); 10262306a36Sopenharmony_ci } else { 10362306a36Sopenharmony_ci err = -EAFNOSUPPORT; 10462306a36Sopenharmony_ci } 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci if (err) 10762306a36Sopenharmony_ci goto err; 10862306a36Sopenharmony_ci return dst_input(skb); 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_cierr: 11162306a36Sopenharmony_ci kfree_skb(skb); 11262306a36Sopenharmony_ci return err; 11362306a36Sopenharmony_ci} 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_cistatic int bpf_input(struct sk_buff *skb) 11662306a36Sopenharmony_ci{ 11762306a36Sopenharmony_ci struct dst_entry *dst = skb_dst(skb); 11862306a36Sopenharmony_ci struct bpf_lwt *bpf; 11962306a36Sopenharmony_ci int ret; 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci bpf = bpf_lwt_lwtunnel(dst->lwtstate); 12262306a36Sopenharmony_ci if (bpf->in.prog) { 12362306a36Sopenharmony_ci ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); 12462306a36Sopenharmony_ci if (ret < 0) 12562306a36Sopenharmony_ci return ret; 12662306a36Sopenharmony_ci if (ret == BPF_LWT_REROUTE) 12762306a36Sopenharmony_ci return bpf_lwt_input_reroute(skb); 12862306a36Sopenharmony_ci } 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci if (unlikely(!dst->lwtstate->orig_input)) { 13162306a36Sopenharmony_ci kfree_skb(skb); 13262306a36Sopenharmony_ci return -EINVAL; 13362306a36Sopenharmony_ci } 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci return dst->lwtstate->orig_input(skb); 13662306a36Sopenharmony_ci} 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_cistatic int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 13962306a36Sopenharmony_ci{ 14062306a36Sopenharmony_ci struct dst_entry *dst = skb_dst(skb); 14162306a36Sopenharmony_ci struct bpf_lwt *bpf; 14262306a36Sopenharmony_ci int ret; 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci bpf = bpf_lwt_lwtunnel(dst->lwtstate); 14562306a36Sopenharmony_ci if (bpf->out.prog) { 14662306a36Sopenharmony_ci ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); 14762306a36Sopenharmony_ci if (ret < 0) 14862306a36Sopenharmony_ci return ret; 14962306a36Sopenharmony_ci } 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci if (unlikely(!dst->lwtstate->orig_output)) { 15262306a36Sopenharmony_ci pr_warn_once("orig_output not set on dst for prog %s\n", 15362306a36Sopenharmony_ci bpf->out.name); 15462306a36Sopenharmony_ci kfree_skb(skb); 15562306a36Sopenharmony_ci return -EINVAL; 15662306a36Sopenharmony_ci } 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci return dst->lwtstate->orig_output(net, sk, skb); 15962306a36Sopenharmony_ci} 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_cistatic int xmit_check_hhlen(struct sk_buff *skb, int hh_len) 16262306a36Sopenharmony_ci{ 16362306a36Sopenharmony_ci if (skb_headroom(skb) < hh_len) { 16462306a36Sopenharmony_ci int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); 16562306a36Sopenharmony_ci 16662306a36Sopenharmony_ci if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) 16762306a36Sopenharmony_ci return -ENOMEM; 16862306a36Sopenharmony_ci } 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci return 0; 17162306a36Sopenharmony_ci} 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_cistatic int bpf_lwt_xmit_reroute(struct sk_buff *skb) 17462306a36Sopenharmony_ci{ 17562306a36Sopenharmony_ci struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); 17662306a36Sopenharmony_ci int oif = l3mdev ? l3mdev->ifindex : 0; 17762306a36Sopenharmony_ci struct dst_entry *dst = NULL; 17862306a36Sopenharmony_ci int err = -EAFNOSUPPORT; 17962306a36Sopenharmony_ci struct sock *sk; 18062306a36Sopenharmony_ci struct net *net; 18162306a36Sopenharmony_ci bool ipv4; 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci if (skb->protocol == htons(ETH_P_IP)) 18462306a36Sopenharmony_ci ipv4 = true; 18562306a36Sopenharmony_ci else if (skb->protocol == htons(ETH_P_IPV6)) 18662306a36Sopenharmony_ci ipv4 = false; 18762306a36Sopenharmony_ci else 18862306a36Sopenharmony_ci goto err; 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci sk = sk_to_full_sk(skb->sk); 19162306a36Sopenharmony_ci if (sk) { 19262306a36Sopenharmony_ci if (sk->sk_bound_dev_if) 19362306a36Sopenharmony_ci oif = sk->sk_bound_dev_if; 19462306a36Sopenharmony_ci net = sock_net(sk); 19562306a36Sopenharmony_ci } else { 19662306a36Sopenharmony_ci net = dev_net(skb_dst(skb)->dev); 19762306a36Sopenharmony_ci } 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci if (ipv4) { 20062306a36Sopenharmony_ci struct iphdr *iph = ip_hdr(skb); 20162306a36Sopenharmony_ci struct flowi4 fl4 = {}; 20262306a36Sopenharmony_ci struct rtable *rt; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci fl4.flowi4_oif = oif; 20562306a36Sopenharmony_ci fl4.flowi4_mark = skb->mark; 20662306a36Sopenharmony_ci fl4.flowi4_uid = sock_net_uid(net, sk); 20762306a36Sopenharmony_ci fl4.flowi4_tos = RT_TOS(iph->tos); 20862306a36Sopenharmony_ci fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; 20962306a36Sopenharmony_ci fl4.flowi4_proto = iph->protocol; 21062306a36Sopenharmony_ci fl4.daddr = iph->daddr; 21162306a36Sopenharmony_ci fl4.saddr = iph->saddr; 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci rt = ip_route_output_key(net, &fl4); 21462306a36Sopenharmony_ci if (IS_ERR(rt)) { 21562306a36Sopenharmony_ci err = PTR_ERR(rt); 21662306a36Sopenharmony_ci goto err; 21762306a36Sopenharmony_ci } 21862306a36Sopenharmony_ci dst = &rt->dst; 21962306a36Sopenharmony_ci } else { 22062306a36Sopenharmony_ci struct ipv6hdr *iph6 = ipv6_hdr(skb); 22162306a36Sopenharmony_ci struct flowi6 fl6 = {}; 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci fl6.flowi6_oif = oif; 22462306a36Sopenharmony_ci fl6.flowi6_mark = skb->mark; 22562306a36Sopenharmony_ci fl6.flowi6_uid = sock_net_uid(net, sk); 22662306a36Sopenharmony_ci fl6.flowlabel = ip6_flowinfo(iph6); 22762306a36Sopenharmony_ci fl6.flowi6_proto = iph6->nexthdr; 22862306a36Sopenharmony_ci fl6.daddr = iph6->daddr; 22962306a36Sopenharmony_ci fl6.saddr = iph6->saddr; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); 23262306a36Sopenharmony_ci if (IS_ERR(dst)) { 23362306a36Sopenharmony_ci err = PTR_ERR(dst); 23462306a36Sopenharmony_ci goto err; 23562306a36Sopenharmony_ci } 23662306a36Sopenharmony_ci } 23762306a36Sopenharmony_ci if (unlikely(dst->error)) { 23862306a36Sopenharmony_ci err = dst->error; 23962306a36Sopenharmony_ci dst_release(dst); 24062306a36Sopenharmony_ci goto err; 24162306a36Sopenharmony_ci } 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it 24462306a36Sopenharmony_ci * was done for the previous dst, so we are doing it here again, in 24562306a36Sopenharmony_ci * case the new dst needs much more space. The call below is a noop 24662306a36Sopenharmony_ci * if there is enough header space in skb. 24762306a36Sopenharmony_ci */ 24862306a36Sopenharmony_ci err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); 24962306a36Sopenharmony_ci if (unlikely(err)) 25062306a36Sopenharmony_ci goto err; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci skb_dst_drop(skb); 25362306a36Sopenharmony_ci skb_dst_set(skb, dst); 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); 25662306a36Sopenharmony_ci if (unlikely(err)) 25762306a36Sopenharmony_ci return net_xmit_errno(err); 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ 26062306a36Sopenharmony_ci return LWTUNNEL_XMIT_DONE; 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_cierr: 26362306a36Sopenharmony_ci kfree_skb(skb); 26462306a36Sopenharmony_ci return err; 26562306a36Sopenharmony_ci} 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_cistatic int bpf_xmit(struct sk_buff *skb) 26862306a36Sopenharmony_ci{ 26962306a36Sopenharmony_ci struct dst_entry *dst = skb_dst(skb); 27062306a36Sopenharmony_ci struct bpf_lwt *bpf; 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci bpf = bpf_lwt_lwtunnel(dst->lwtstate); 27362306a36Sopenharmony_ci if (bpf->xmit.prog) { 27462306a36Sopenharmony_ci int hh_len = dst->dev->hard_header_len; 27562306a36Sopenharmony_ci __be16 proto = skb->protocol; 27662306a36Sopenharmony_ci int ret; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); 27962306a36Sopenharmony_ci switch (ret) { 28062306a36Sopenharmony_ci case BPF_OK: 28162306a36Sopenharmony_ci /* If the header changed, e.g. via bpf_lwt_push_encap, 28262306a36Sopenharmony_ci * BPF_LWT_REROUTE below should have been used if the 28362306a36Sopenharmony_ci * protocol was also changed. 28462306a36Sopenharmony_ci */ 28562306a36Sopenharmony_ci if (skb->protocol != proto) { 28662306a36Sopenharmony_ci kfree_skb(skb); 28762306a36Sopenharmony_ci return -EINVAL; 28862306a36Sopenharmony_ci } 28962306a36Sopenharmony_ci /* If the header was expanded, headroom might be too 29062306a36Sopenharmony_ci * small for L2 header to come, expand as needed. 29162306a36Sopenharmony_ci */ 29262306a36Sopenharmony_ci ret = xmit_check_hhlen(skb, hh_len); 29362306a36Sopenharmony_ci if (unlikely(ret)) 29462306a36Sopenharmony_ci return ret; 29562306a36Sopenharmony_ci 29662306a36Sopenharmony_ci return LWTUNNEL_XMIT_CONTINUE; 29762306a36Sopenharmony_ci case BPF_REDIRECT: 29862306a36Sopenharmony_ci return LWTUNNEL_XMIT_DONE; 29962306a36Sopenharmony_ci case BPF_LWT_REROUTE: 30062306a36Sopenharmony_ci return bpf_lwt_xmit_reroute(skb); 30162306a36Sopenharmony_ci default: 30262306a36Sopenharmony_ci return ret; 30362306a36Sopenharmony_ci } 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci 30662306a36Sopenharmony_ci return LWTUNNEL_XMIT_CONTINUE; 30762306a36Sopenharmony_ci} 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_cistatic void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) 31062306a36Sopenharmony_ci{ 31162306a36Sopenharmony_ci if (prog->prog) 31262306a36Sopenharmony_ci bpf_prog_put(prog->prog); 31362306a36Sopenharmony_ci 31462306a36Sopenharmony_ci kfree(prog->name); 31562306a36Sopenharmony_ci} 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_cistatic void bpf_destroy_state(struct lwtunnel_state *lwt) 31862306a36Sopenharmony_ci{ 31962306a36Sopenharmony_ci struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci bpf_lwt_prog_destroy(&bpf->in); 32262306a36Sopenharmony_ci bpf_lwt_prog_destroy(&bpf->out); 32362306a36Sopenharmony_ci bpf_lwt_prog_destroy(&bpf->xmit); 32462306a36Sopenharmony_ci} 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_cistatic const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { 32762306a36Sopenharmony_ci [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, 32862306a36Sopenharmony_ci [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, 32962306a36Sopenharmony_ci .len = MAX_PROG_NAME }, 33062306a36Sopenharmony_ci}; 33162306a36Sopenharmony_ci 33262306a36Sopenharmony_cistatic int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, 33362306a36Sopenharmony_ci enum bpf_prog_type type) 33462306a36Sopenharmony_ci{ 33562306a36Sopenharmony_ci struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; 33662306a36Sopenharmony_ci struct bpf_prog *p; 33762306a36Sopenharmony_ci int ret; 33862306a36Sopenharmony_ci u32 fd; 33962306a36Sopenharmony_ci 34062306a36Sopenharmony_ci ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, 34162306a36Sopenharmony_ci bpf_prog_policy, NULL); 34262306a36Sopenharmony_ci if (ret < 0) 34362306a36Sopenharmony_ci return ret; 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ci if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) 34662306a36Sopenharmony_ci return -EINVAL; 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ci prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); 34962306a36Sopenharmony_ci if (!prog->name) 35062306a36Sopenharmony_ci return -ENOMEM; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); 35362306a36Sopenharmony_ci p = bpf_prog_get_type(fd, type); 35462306a36Sopenharmony_ci if (IS_ERR(p)) 35562306a36Sopenharmony_ci return PTR_ERR(p); 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci prog->prog = p; 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci return 0; 36062306a36Sopenharmony_ci} 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_cistatic const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { 36362306a36Sopenharmony_ci [LWT_BPF_IN] = { .type = NLA_NESTED, }, 36462306a36Sopenharmony_ci [LWT_BPF_OUT] = { .type = NLA_NESTED, }, 36562306a36Sopenharmony_ci [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, 36662306a36Sopenharmony_ci [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, 36762306a36Sopenharmony_ci}; 36862306a36Sopenharmony_ci 36962306a36Sopenharmony_cistatic int bpf_build_state(struct net *net, struct nlattr *nla, 37062306a36Sopenharmony_ci unsigned int family, const void *cfg, 37162306a36Sopenharmony_ci struct lwtunnel_state **ts, 37262306a36Sopenharmony_ci struct netlink_ext_ack *extack) 37362306a36Sopenharmony_ci{ 37462306a36Sopenharmony_ci struct nlattr *tb[LWT_BPF_MAX + 1]; 37562306a36Sopenharmony_ci struct lwtunnel_state *newts; 37662306a36Sopenharmony_ci struct bpf_lwt *bpf; 37762306a36Sopenharmony_ci int ret; 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_ci if (family != AF_INET && family != AF_INET6) 38062306a36Sopenharmony_ci return -EAFNOSUPPORT; 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, 38362306a36Sopenharmony_ci extack); 38462306a36Sopenharmony_ci if (ret < 0) 38562306a36Sopenharmony_ci return ret; 38662306a36Sopenharmony_ci 38762306a36Sopenharmony_ci if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) 38862306a36Sopenharmony_ci return -EINVAL; 38962306a36Sopenharmony_ci 39062306a36Sopenharmony_ci newts = lwtunnel_state_alloc(sizeof(*bpf)); 39162306a36Sopenharmony_ci if (!newts) 39262306a36Sopenharmony_ci return -ENOMEM; 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_ci newts->type = LWTUNNEL_ENCAP_BPF; 39562306a36Sopenharmony_ci bpf = bpf_lwt_lwtunnel(newts); 39662306a36Sopenharmony_ci 39762306a36Sopenharmony_ci if (tb[LWT_BPF_IN]) { 39862306a36Sopenharmony_ci newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; 39962306a36Sopenharmony_ci ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, 40062306a36Sopenharmony_ci BPF_PROG_TYPE_LWT_IN); 40162306a36Sopenharmony_ci if (ret < 0) 40262306a36Sopenharmony_ci goto errout; 40362306a36Sopenharmony_ci } 40462306a36Sopenharmony_ci 40562306a36Sopenharmony_ci if (tb[LWT_BPF_OUT]) { 40662306a36Sopenharmony_ci newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; 40762306a36Sopenharmony_ci ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, 40862306a36Sopenharmony_ci BPF_PROG_TYPE_LWT_OUT); 40962306a36Sopenharmony_ci if (ret < 0) 41062306a36Sopenharmony_ci goto errout; 41162306a36Sopenharmony_ci } 41262306a36Sopenharmony_ci 41362306a36Sopenharmony_ci if (tb[LWT_BPF_XMIT]) { 41462306a36Sopenharmony_ci newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; 41562306a36Sopenharmony_ci ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, 41662306a36Sopenharmony_ci BPF_PROG_TYPE_LWT_XMIT); 41762306a36Sopenharmony_ci if (ret < 0) 41862306a36Sopenharmony_ci goto errout; 41962306a36Sopenharmony_ci } 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_ci if (tb[LWT_BPF_XMIT_HEADROOM]) { 42262306a36Sopenharmony_ci u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci if (headroom > LWT_BPF_MAX_HEADROOM) { 42562306a36Sopenharmony_ci ret = -ERANGE; 42662306a36Sopenharmony_ci goto errout; 42762306a36Sopenharmony_ci } 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_ci newts->headroom = headroom; 43062306a36Sopenharmony_ci } 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci bpf->family = family; 43362306a36Sopenharmony_ci *ts = newts; 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci return 0; 43662306a36Sopenharmony_ci 43762306a36Sopenharmony_cierrout: 43862306a36Sopenharmony_ci bpf_destroy_state(newts); 43962306a36Sopenharmony_ci kfree(newts); 44062306a36Sopenharmony_ci return ret; 44162306a36Sopenharmony_ci} 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_cistatic int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, 44462306a36Sopenharmony_ci struct bpf_lwt_prog *prog) 44562306a36Sopenharmony_ci{ 44662306a36Sopenharmony_ci struct nlattr *nest; 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci if (!prog->prog) 44962306a36Sopenharmony_ci return 0; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci nest = nla_nest_start_noflag(skb, attr); 45262306a36Sopenharmony_ci if (!nest) 45362306a36Sopenharmony_ci return -EMSGSIZE; 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci if (prog->name && 45662306a36Sopenharmony_ci nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) 45762306a36Sopenharmony_ci return -EMSGSIZE; 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_ci return nla_nest_end(skb, nest); 46062306a36Sopenharmony_ci} 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_cistatic int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) 46362306a36Sopenharmony_ci{ 46462306a36Sopenharmony_ci struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || 46762306a36Sopenharmony_ci bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || 46862306a36Sopenharmony_ci bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) 46962306a36Sopenharmony_ci return -EMSGSIZE; 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci return 0; 47262306a36Sopenharmony_ci} 47362306a36Sopenharmony_ci 47462306a36Sopenharmony_cistatic int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) 47562306a36Sopenharmony_ci{ 47662306a36Sopenharmony_ci int nest_len = nla_total_size(sizeof(struct nlattr)) + 47762306a36Sopenharmony_ci nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ 47862306a36Sopenharmony_ci 0; 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_ci return nest_len + /* LWT_BPF_IN */ 48162306a36Sopenharmony_ci nest_len + /* LWT_BPF_OUT */ 48262306a36Sopenharmony_ci nest_len + /* LWT_BPF_XMIT */ 48362306a36Sopenharmony_ci 0; 48462306a36Sopenharmony_ci} 48562306a36Sopenharmony_ci 48662306a36Sopenharmony_cistatic int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) 48762306a36Sopenharmony_ci{ 48862306a36Sopenharmony_ci /* FIXME: 48962306a36Sopenharmony_ci * The LWT state is currently rebuilt for delete requests which 49062306a36Sopenharmony_ci * results in a new bpf_prog instance. Comparing names for now. 49162306a36Sopenharmony_ci */ 49262306a36Sopenharmony_ci if (!a->name && !b->name) 49362306a36Sopenharmony_ci return 0; 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci if (!a->name || !b->name) 49662306a36Sopenharmony_ci return 1; 49762306a36Sopenharmony_ci 49862306a36Sopenharmony_ci return strcmp(a->name, b->name); 49962306a36Sopenharmony_ci} 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_cistatic int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) 50262306a36Sopenharmony_ci{ 50362306a36Sopenharmony_ci struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); 50462306a36Sopenharmony_ci struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || 50762306a36Sopenharmony_ci bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || 50862306a36Sopenharmony_ci bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); 50962306a36Sopenharmony_ci} 51062306a36Sopenharmony_ci 51162306a36Sopenharmony_cistatic const struct lwtunnel_encap_ops bpf_encap_ops = { 51262306a36Sopenharmony_ci .build_state = bpf_build_state, 51362306a36Sopenharmony_ci .destroy_state = bpf_destroy_state, 51462306a36Sopenharmony_ci .input = bpf_input, 51562306a36Sopenharmony_ci .output = bpf_output, 51662306a36Sopenharmony_ci .xmit = bpf_xmit, 51762306a36Sopenharmony_ci .fill_encap = bpf_fill_encap_info, 51862306a36Sopenharmony_ci .get_encap_size = bpf_encap_nlsize, 51962306a36Sopenharmony_ci .cmp_encap = bpf_encap_cmp, 52062306a36Sopenharmony_ci .owner = THIS_MODULE, 52162306a36Sopenharmony_ci}; 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_cistatic int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, 52462306a36Sopenharmony_ci int encap_len) 52562306a36Sopenharmony_ci{ 52662306a36Sopenharmony_ci struct skb_shared_info *shinfo = skb_shinfo(skb); 52762306a36Sopenharmony_ci 52862306a36Sopenharmony_ci gso_type |= SKB_GSO_DODGY; 52962306a36Sopenharmony_ci shinfo->gso_type |= gso_type; 53062306a36Sopenharmony_ci skb_decrease_gso_size(shinfo, encap_len); 53162306a36Sopenharmony_ci shinfo->gso_segs = 0; 53262306a36Sopenharmony_ci return 0; 53362306a36Sopenharmony_ci} 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_cistatic int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) 53662306a36Sopenharmony_ci{ 53762306a36Sopenharmony_ci int next_hdr_offset; 53862306a36Sopenharmony_ci void *next_hdr; 53962306a36Sopenharmony_ci __u8 protocol; 54062306a36Sopenharmony_ci 54162306a36Sopenharmony_ci /* SCTP and UDP_L4 gso need more nuanced handling than what 54262306a36Sopenharmony_ci * handle_gso_type() does above: skb_decrease_gso_size() is not enough. 54362306a36Sopenharmony_ci * So at the moment only TCP GSO packets are let through. 54462306a36Sopenharmony_ci */ 54562306a36Sopenharmony_ci if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 54662306a36Sopenharmony_ci return -ENOTSUPP; 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_ci if (ipv4) { 54962306a36Sopenharmony_ci protocol = ip_hdr(skb)->protocol; 55062306a36Sopenharmony_ci next_hdr_offset = sizeof(struct iphdr); 55162306a36Sopenharmony_ci next_hdr = skb_network_header(skb) + next_hdr_offset; 55262306a36Sopenharmony_ci } else { 55362306a36Sopenharmony_ci protocol = ipv6_hdr(skb)->nexthdr; 55462306a36Sopenharmony_ci next_hdr_offset = sizeof(struct ipv6hdr); 55562306a36Sopenharmony_ci next_hdr = skb_network_header(skb) + next_hdr_offset; 55662306a36Sopenharmony_ci } 55762306a36Sopenharmony_ci 55862306a36Sopenharmony_ci switch (protocol) { 55962306a36Sopenharmony_ci case IPPROTO_GRE: 56062306a36Sopenharmony_ci next_hdr_offset += sizeof(struct gre_base_hdr); 56162306a36Sopenharmony_ci if (next_hdr_offset > encap_len) 56262306a36Sopenharmony_ci return -EINVAL; 56362306a36Sopenharmony_ci 56462306a36Sopenharmony_ci if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) 56562306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_GRE_CSUM, 56662306a36Sopenharmony_ci encap_len); 56762306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_GRE, encap_len); 56862306a36Sopenharmony_ci 56962306a36Sopenharmony_ci case IPPROTO_UDP: 57062306a36Sopenharmony_ci next_hdr_offset += sizeof(struct udphdr); 57162306a36Sopenharmony_ci if (next_hdr_offset > encap_len) 57262306a36Sopenharmony_ci return -EINVAL; 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_ci if (((struct udphdr *)next_hdr)->check) 57562306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, 57662306a36Sopenharmony_ci encap_len); 57762306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci case IPPROTO_IP: 58062306a36Sopenharmony_ci case IPPROTO_IPV6: 58162306a36Sopenharmony_ci if (ipv4) 58262306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); 58362306a36Sopenharmony_ci else 58462306a36Sopenharmony_ci return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); 58562306a36Sopenharmony_ci 58662306a36Sopenharmony_ci default: 58762306a36Sopenharmony_ci return -EPROTONOSUPPORT; 58862306a36Sopenharmony_ci } 58962306a36Sopenharmony_ci} 59062306a36Sopenharmony_ci 59162306a36Sopenharmony_ciint bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) 59262306a36Sopenharmony_ci{ 59362306a36Sopenharmony_ci struct iphdr *iph; 59462306a36Sopenharmony_ci bool ipv4; 59562306a36Sopenharmony_ci int err; 59662306a36Sopenharmony_ci 59762306a36Sopenharmony_ci if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) 59862306a36Sopenharmony_ci return -EINVAL; 59962306a36Sopenharmony_ci 60062306a36Sopenharmony_ci /* validate protocol and length */ 60162306a36Sopenharmony_ci iph = (struct iphdr *)hdr; 60262306a36Sopenharmony_ci if (iph->version == 4) { 60362306a36Sopenharmony_ci ipv4 = true; 60462306a36Sopenharmony_ci if (unlikely(len < iph->ihl * 4)) 60562306a36Sopenharmony_ci return -EINVAL; 60662306a36Sopenharmony_ci } else if (iph->version == 6) { 60762306a36Sopenharmony_ci ipv4 = false; 60862306a36Sopenharmony_ci if (unlikely(len < sizeof(struct ipv6hdr))) 60962306a36Sopenharmony_ci return -EINVAL; 61062306a36Sopenharmony_ci } else { 61162306a36Sopenharmony_ci return -EINVAL; 61262306a36Sopenharmony_ci } 61362306a36Sopenharmony_ci 61462306a36Sopenharmony_ci if (ingress) 61562306a36Sopenharmony_ci err = skb_cow_head(skb, len + skb->mac_len); 61662306a36Sopenharmony_ci else 61762306a36Sopenharmony_ci err = skb_cow_head(skb, 61862306a36Sopenharmony_ci len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); 61962306a36Sopenharmony_ci if (unlikely(err)) 62062306a36Sopenharmony_ci return err; 62162306a36Sopenharmony_ci 62262306a36Sopenharmony_ci /* push the encap headers and fix pointers */ 62362306a36Sopenharmony_ci skb_reset_inner_headers(skb); 62462306a36Sopenharmony_ci skb_reset_inner_mac_header(skb); /* mac header is not yet set */ 62562306a36Sopenharmony_ci skb_set_inner_protocol(skb, skb->protocol); 62662306a36Sopenharmony_ci skb->encapsulation = 1; 62762306a36Sopenharmony_ci skb_push(skb, len); 62862306a36Sopenharmony_ci if (ingress) 62962306a36Sopenharmony_ci skb_postpush_rcsum(skb, iph, len); 63062306a36Sopenharmony_ci skb_reset_network_header(skb); 63162306a36Sopenharmony_ci memcpy(skb_network_header(skb), hdr, len); 63262306a36Sopenharmony_ci bpf_compute_data_pointers(skb); 63362306a36Sopenharmony_ci skb_clear_hash(skb); 63462306a36Sopenharmony_ci 63562306a36Sopenharmony_ci if (ipv4) { 63662306a36Sopenharmony_ci skb->protocol = htons(ETH_P_IP); 63762306a36Sopenharmony_ci iph = ip_hdr(skb); 63862306a36Sopenharmony_ci 63962306a36Sopenharmony_ci if (!iph->check) 64062306a36Sopenharmony_ci iph->check = ip_fast_csum((unsigned char *)iph, 64162306a36Sopenharmony_ci iph->ihl); 64262306a36Sopenharmony_ci } else { 64362306a36Sopenharmony_ci skb->protocol = htons(ETH_P_IPV6); 64462306a36Sopenharmony_ci } 64562306a36Sopenharmony_ci 64662306a36Sopenharmony_ci if (skb_is_gso(skb)) 64762306a36Sopenharmony_ci return handle_gso_encap(skb, ipv4, len); 64862306a36Sopenharmony_ci 64962306a36Sopenharmony_ci return 0; 65062306a36Sopenharmony_ci} 65162306a36Sopenharmony_ci 65262306a36Sopenharmony_cistatic int __init bpf_lwt_init(void) 65362306a36Sopenharmony_ci{ 65462306a36Sopenharmony_ci return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); 65562306a36Sopenharmony_ci} 65662306a36Sopenharmony_ci 65762306a36Sopenharmony_cisubsys_initcall(bpf_lwt_init) 658