bpf: Add redirect_neigh helper as redirect drop-in

author Daniel Borkmann <daniel@iogearbox.net>

Wed, 30 Sep 2020 15:18:17 +0000 (17:18 +0200)

committer Alexei Starovoitov <ast@kernel.org>

Wed, 30 Sep 2020 18:50:35 +0000 (11:50 -0700)
author Daniel Borkmann <daniel@iogearbox.net>
Wed, 30 Sep 2020 15:18:17 +0000 (17:18 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Wed, 30 Sep 2020 18:50:35 +0000 (11:50 -0700)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

index 04a18e0..3d0cf37 100644 (file)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb)
         return skb->mac_header != (typeof(skb->mac_header))~0U;
  }
  
+static inline void skb_unset_mac_header(struct sk_buff *skb) /* Mark the skb's MAC header offset as "not set". */
+{
+       skb->mac_header = (typeof(skb->mac_header))~0U; /* same sentinel that skb_mac_header_was_set() compares against */
+}
+
  static inline void skb_reset_mac_header(struct sk_buff *skb)
  {
         skb->mac_header = skb->data - skb->head;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 6116a7f..1f17c67 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3652,6 +3652,19 @@ union bpf_attr {
   *             associated socket instead of the current process.
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ *     Description
+ *             Redirect the packet to another net device of index *ifindex*
+ *             and fill in L2 addresses from neighboring subsystem. This helper
+ *             is somewhat similar to **bpf_redirect**\ (), except that it
+ *             fills in e.g. MAC addresses based on the L3 information from
+ *             the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *             The *flags* argument is reserved and must be 0. The helper is
+ *             currently only supported for tc BPF program types.
+ *     Return
+ *             The helper returns **TC_ACT_REDIRECT** on success or
+ *             **TC_ACT_SHOT** on error.
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3806,6 +3819,7 @@ union bpf_attr {
         FN(snprintf_btf),               \
         FN(seq_printf_btf),             \
         FN(skb_cgroup_classid),         \
+       FN(redirect_neigh),             \
         /* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c

index a0776e4..3fb6ada 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2163,13 +2163,233 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
                 return __bpf_redirect_no_mac(skb, dev, flags);
  }
  
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) /* Transmit skb to the IPv6 next hop of its attached dst via the neighbour layer; net is not referenced in the body. */
+{
+       struct dst_entry *dst = skb_dst(skb); /* route the caller attached with skb_dst_set() */
+       struct net_device *dev = dst->dev;
+       u32 hh_len = LL_RESERVED_SPACE(dev); /* headroom threshold checked below */
+       const struct in6_addr *nexthop;
+       struct neighbour *neigh;
+
+       if (dev_xmit_recursion()) { /* recursion guard tripped: drop instead of transmitting */
+               net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+               goto out_drop;
+       }
+
+       skb->dev = dev; /* packet now belongs to the route's output device */
+       skb->tstamp = 0;
+
+       if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { /* too little headroom and the device has header_ops: reallocate */
+               struct sk_buff *skb2;
+
+               skb2 = skb_realloc_headroom(skb, hh_len);
+               if (unlikely(!skb2)) {
+                       kfree_skb(skb); /* allocation failed: free the original and bail out */
+                       return -ENOMEM;
+               }
+               if (skb->sk)
+                       skb_set_owner_w(skb2, skb->sk); /* keep the socket association on the new skb */
+               consume_skb(skb); /* release the original; skb2 replaces it */
+               skb = skb2;
+       }
+
+       rcu_read_lock_bh(); /* held across the next-hop lookup and neigh_output() */
+       nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+                             &ipv6_hdr(skb)->daddr);
+       neigh = ip_neigh_gw6(dev, nexthop);
+       if (likely(!IS_ERR(neigh))) {
+               int ret;
+
+               sock_confirm_neigh(skb, neigh);
+               dev_xmit_recursion_inc(); /* counter is raised only around neigh_output() */
+               ret = neigh_output(neigh, skb, false);
+               dev_xmit_recursion_dec();
+               rcu_read_unlock_bh();
+               return ret; /* neigh_output() result is returned unchanged */
+       }
+       rcu_read_unlock_bh();
+       IP6_INC_STATS(dev_net(dst->dev),
+                     ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); /* neighbour lookup failed: count it, then drop */
+out_drop:
+       kfree_skb(skb);
+       return -ENETDOWN; /* shared error code for the recursion and neighbour-failure drops */
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) /* Look up an IPv6 route out of dev for skb and send it via bpf_out_neigh_v6(); returns NET_XMIT_SUCCESS or NET_XMIT_DROP. */
+{
+       const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+       struct net *net = dev_net(dev);
+       int err, ret = NET_XMIT_DROP; /* drop is the default result; only a clean transmit overrides it */
+       struct dst_entry *dst;
+       struct flowi6 fl6 = { /* flow key built from the packet's own IPv6 header, the skb mark and dev */
+               .flowi6_flags   = FLOWI_FLAG_ANYSRC,
+               .flowi6_mark    = skb->mark,
+               .flowlabel      = ip6_flowinfo(ip6h),
+               .flowi6_oif     = dev->ifindex, /* output interface is the redirect target */
+               .flowi6_proto   = ip6h->nexthdr,
+               .daddr          = ip6h->daddr,
+               .saddr          = ip6h->saddr,
+       };
+
+       dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+       if (IS_ERR(dst))
+               goto out_drop; /* no route: skb is still ours, so out_drop frees it */
+
+       skb_dst_set(skb, dst); /* bpf_out_neigh_v6() reads the route back via skb_dst() */
+
+       err = bpf_out_neigh_v6(net, skb);
+       if (unlikely(net_xmit_eval(err)))
+               dev->stats.tx_errors++; /* no kfree_skb() on this path, unlike out_drop */
+       else
+               ret = NET_XMIT_SUCCESS;
+       goto out_xmit; /* skip the out_drop accounting and free */
+out_drop:
+       dev->stats.tx_errors++;
+       kfree_skb(skb);
+out_xmit:
+       return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) /* Stub for builds where IS_ENABLED(CONFIG_IPV6) is false: dev is unused. */
+{
+       kfree_skb(skb); /* nothing can transmit the packet, so free it */
+       return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) /* Transmit skb to the gateway/neighbour of its attached IPv4 route; net is not referenced in the body. */
+{
+       struct dst_entry *dst = skb_dst(skb); /* route the caller attached with skb_dst_set() */
+       struct rtable *rt = container_of(dst, struct rtable, dst);
+       struct net_device *dev = dst->dev;
+       u32 hh_len = LL_RESERVED_SPACE(dev); /* headroom threshold checked below */
+       struct neighbour *neigh;
+       bool is_v6gw = false; /* passed by address to ip_neigh_for_gw(), then forwarded to neigh_output() */
+
+       if (dev_xmit_recursion()) { /* recursion guard tripped: drop instead of transmitting */
+               net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+               goto out_drop;
+       }
+
+       skb->dev = dev; /* packet now belongs to the route's output device */
+       skb->tstamp = 0;
+
+       if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { /* too little headroom and the device has header_ops: reallocate */
+               struct sk_buff *skb2;
+
+               skb2 = skb_realloc_headroom(skb, hh_len);
+               if (unlikely(!skb2)) {
+                       kfree_skb(skb); /* allocation failed: free the original and bail out */
+                       return -ENOMEM;
+               }
+               if (skb->sk)
+                       skb_set_owner_w(skb2, skb->sk); /* keep the socket association on the new skb */
+               consume_skb(skb); /* release the original; skb2 replaces it */
+               skb = skb2;
+       }
+
+       rcu_read_lock_bh(); /* held across the neighbour lookup and neigh_output() */
+       neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+       if (likely(!IS_ERR(neigh))) {
+               int ret;
+
+               sock_confirm_neigh(skb, neigh);
+               dev_xmit_recursion_inc(); /* counter is raised only around neigh_output() */
+               ret = neigh_output(neigh, skb, is_v6gw);
+               dev_xmit_recursion_dec();
+               rcu_read_unlock_bh();
+               return ret; /* neigh_output() result is returned unchanged */
+       }
+       rcu_read_unlock_bh(); /* unlike the IPv6 variant, no MIB counter is bumped on this failure */
+out_drop:
+       kfree_skb(skb);
+       return -ENETDOWN; /* shared error code for the recursion and neighbour-failure drops */
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) /* Look up an IPv4 route out of dev for skb and send it via bpf_out_neigh_v4(); returns NET_XMIT_SUCCESS or NET_XMIT_DROP. */
+{
+       const struct iphdr *ip4h = ip_hdr(skb);
+       struct net *net = dev_net(dev);
+       int err, ret = NET_XMIT_DROP; /* drop is the default result; only a clean transmit overrides it */
+       struct rtable *rt;
+       struct flowi4 fl4 = { /* flow key built from the packet's own IPv4 header, the skb mark and dev */
+               .flowi4_flags   = FLOWI_FLAG_ANYSRC,
+               .flowi4_mark    = skb->mark,
+               .flowi4_tos     = RT_TOS(ip4h->tos),
+               .flowi4_oif     = dev->ifindex, /* output interface is the redirect target */
+               .flowi4_proto   = ip4h->protocol,
+               .daddr          = ip4h->daddr,
+               .saddr          = ip4h->saddr,
+       };
+
+       rt = ip_route_output_flow(net, &fl4, NULL);
+       if (IS_ERR(rt))
+               goto out_drop; /* no route: skb is still ours, so out_drop frees it */
+       if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { /* only unicast and local routes are accepted */
+               ip_rt_put(rt); /* release the route reference before dropping */
+               goto out_drop;
+       }
+
+       skb_dst_set(skb, &rt->dst); /* bpf_out_neigh_v4() reads the route back via skb_dst() */
+
+       err = bpf_out_neigh_v4(net, skb);
+       if (unlikely(net_xmit_eval(err)))
+               dev->stats.tx_errors++; /* no kfree_skb() on this path, unlike out_drop */
+       else
+               ret = NET_XMIT_SUCCESS;
+       goto out_xmit; /* skip the out_drop accounting and free */
+out_drop:
+       dev->stats.tx_errors++;
+       kfree_skb(skb);
+out_xmit:
+       return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) /* Stub for builds where IS_ENABLED(CONFIG_INET) is false: dev is unused. */
+{
+       kfree_skb(skb); /* nothing can transmit the packet, so free it */
+       return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) /* Strip the Ethernet header and hand skb to the IPv4 or IPv6 neigh redirect path; anything else is freed with -ENOTSUPP. */
+{
+       struct ethhdr *ethh = eth_hdr(skb);
+
+       if (unlikely(skb->mac_header >= skb->network_header)) /* MAC header offset does not precede the network header: unsupported */
+               goto out;
+       bpf_push_mac_rcsum(skb); /* helper defined outside this hunk */
+       if (is_multicast_ether_addr(ethh->h_dest)) /* multicast destination MACs are not handled */
+               goto out;
+
+       skb_pull(skb, sizeof(*ethh)); /* remove the Ethernet header from the front of the data */
+       skb_unset_mac_header(skb); /* MAC offset goes back to the "not set" sentinel */
+       skb_reset_network_header(skb);
+
+       if (skb->protocol == htons(ETH_P_IP))
+               return __bpf_redirect_neigh_v4(skb, dev);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               return __bpf_redirect_neigh_v6(skb, dev);
+out: /* also reached by fall-through when skb->protocol is neither IPv4 nor IPv6 */
+       kfree_skb(skb);
+       return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+       BPF_F_NEIGH = (1ULL << 1), /* stored in ri->flags by bpf_redirect_neigh(), tested in skb_do_redirect() */
+#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH) /* mask of internal bits that bpf_redirect() and bpf_clone_redirect() reject from callers */
+};
+
  BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
  {
         struct net_device *dev;
         struct sk_buff *clone;
         int ret;
  
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
+       if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                 return -EINVAL;
  
         dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
@@ -2206,23 +2426,11 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
  DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
  EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
  
-BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
-{
-       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
-               return TC_ACT_SHOT;
-
-       ri->flags = flags;
-       ri->tgt_index = ifindex;
-
-       return TC_ACT_REDIRECT;
-}
-
  int skb_do_redirect(struct sk_buff *skb)
  {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
         struct net_device *dev;
+       u32 flags = ri->flags;
  
         dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
         ri->tgt_index = 0;
@@ -2231,7 +2439,22 @@ int skb_do_redirect(struct sk_buff *skb)
                 return -EINVAL;
         }
  
-       return __bpf_redirect(skb, dev, ri->flags);
+       return flags & BPF_F_NEIGH ?
+              __bpf_redirect_neigh(skb, dev) :
+              __bpf_redirect(skb, dev, flags);
+}
+
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) /* Record a redirect request (target ifindex + flags) in this CPU's bpf_redirect_info. */
+{
+       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+       if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) /* only BPF_F_INGRESS is accepted; internal bits are explicitly part of the reject mask */
+               return TC_ACT_SHOT;
+
+       ri->flags = flags;
+       ri->tgt_index = ifindex; /* looked up later by skb_do_redirect() */
+
+       return TC_ACT_REDIRECT;
  }
  
  static const struct bpf_func_proto bpf_redirect_proto = {
@@ -2242,6 +2465,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
         .arg2_type      = ARG_ANYTHING,
  };
  
+BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) /* Record a neighbour-based redirect request to ifindex in this CPU's bpf_redirect_info. */
+{
+       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+       if (unlikely(flags)) /* flags is reserved: any non-zero value is rejected */
+               return TC_ACT_SHOT;
+
+       ri->flags = BPF_F_NEIGH; /* makes skb_do_redirect() choose __bpf_redirect_neigh() */
+       ri->tgt_index = ifindex;
+
+       return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_neigh_proto = { /* Prototype returned by tc_cls_act_func_proto() for BPF_FUNC_redirect_neigh. */
+       .func           = bpf_redirect_neigh,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER, /* TC_ACT_REDIRECT or TC_ACT_SHOT */
+       .arg1_type      = ARG_ANYTHING, /* ifindex */
+       .arg2_type      = ARG_ANYTHING, /* flags */
+};
+
  BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
  {
         msg->apply_bytes = bytes;
@@ -6759,6 +7003,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return bpf_get_skb_set_tunnel_proto(func_id);
         case BPF_FUNC_redirect:
                 return &bpf_redirect_proto;
+       case BPF_FUNC_redirect_neigh:
+               return &bpf_redirect_neigh_proto;
         case BPF_FUNC_get_route_realm:
                 return &bpf_get_route_realm_proto;
         case BPF_FUNC_get_hash_recalc:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index 6116a7f..1f17c67 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3652,6 +3652,19 @@ union bpf_attr {
   *             associated socket instead of the current process.
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ *     Description
+ *             Redirect the packet to another net device of index *ifindex*
+ *             and fill in L2 addresses from neighboring subsystem. This helper
+ *             is somewhat similar to **bpf_redirect**\ (), except that it
+ *             fills in e.g. MAC addresses based on the L3 information from
+ *             the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *             The *flags* argument is reserved and must be 0. The helper is
+ *             currently only supported for tc BPF program types.
+ *     Return
+ *             The helper returns **TC_ACT_REDIRECT** on success or
+ *             **TC_ACT_SHOT** on error.
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3806,6 +3819,7 @@ union bpf_attr {
         FN(snprintf_btf),               \
         FN(seq_printf_btf),             \
         FN(skb_cgroup_classid),         \
+       FN(redirect_neigh),             \
         /* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
author	Daniel Borkmann <daniel@iogearbox.net>
	Wed, 30 Sep 2020 15:18:17 +0000 (17:18 +0200)
committer	Alexei Starovoitov <ast@kernel.org>
	Wed, 30 Sep 2020 18:50:35 +0000 (11:50 -0700)
include/linux/skbuff.h		patch \| blob \| history
include/uapi/linux/bpf.h		patch \| blob \| history
net/core/filter.c		patch \| blob \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| history