bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop
authorToke Høiland-Jørgensen <toke@redhat.com>
Tue, 20 Oct 2020 21:25:56 +0000 (23:25 +0200)
committerDaniel Borkmann <daniel@iogearbox.net>
Wed, 21 Oct 2020 23:28:54 +0000 (01:28 +0200)
Based on the discussion in [0], update the bpf_redirect_neigh() helper to
accept an optional parameter specifying the nexthop information. This makes
it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without
incurring a duplicate FIB lookup - since the FIB lookup helper will return
the nexthop information even if no neighbour is present, this can simply
be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns
BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen.

  [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk
include/linux/filter.h
include/uapi/linux/bpf.h
net/core/filter.c
scripts/bpf_helpers_doc.py
tools/include/uapi/linux/bpf.h

index 20fc24c..72d62cb 100644 (file)
@@ -607,12 +607,21 @@ struct bpf_skb_data_end {
        void *data_end;
 };
 
+struct bpf_nh_params {
+       u32 nh_family;
+       union {
+               u32 ipv4_nh;
+               struct in6_addr ipv6_nh;
+       };
+};
+
 struct bpf_redirect_info {
        u32 flags;
        u32 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 kern_flags;
+       struct bpf_nh_params nh;
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
index bf5a99d..e6ceac3 100644 (file)
@@ -3677,15 +3677,19 @@ union bpf_attr {
  *     Return
  *             The id is returned or 0 in case the id could not be retrieved.
  *
- * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
  *     Description
  *             Redirect the packet to another net device of index *ifindex*
  *             and fill in L2 addresses from neighboring subsystem. This helper
  *             is somewhat similar to **bpf_redirect**\ (), except that it
  *             populates L2 addresses as well, meaning, internally, the helper
- *             performs a FIB lookup based on the skb's networking header to
- *             get the address of the next hop and then relies on the neighbor
- *             lookup for the L2 address of the nexthop.
+ *             relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ *             The helper will perform a FIB lookup based on the skb's
+ *             networking header to get the address of the next hop, unless
+ *             this is supplied by the caller in the *params* argument. The
+ *             *plen* argument indicates the len of *params* and should be set
+ *             to 0 if *params* is NULL.
  *
  *             The *flags* argument is reserved and must be 0. The helper is
  *             currently only supported for tc BPF program types, and enabled
@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup {
        __u8    dmac[6];     /* ETH_ALEN */
 };
 
+struct bpf_redir_neigh {
+       /* network family for lookup (AF_INET, AF_INET6) */
+       __u32 nh_family;
+       /* network address of nexthop; skips fib lookup to find gateway */
+       union {
+               __be32          ipv4_nh;
+               __u32           ipv6_nh[4];  /* in6_addr; network order */
+       };
+};
+
 enum bpf_task_fd_type {
        BPF_FD_TYPE_RAW_TRACEPOINT,     /* tp name */
        BPF_FD_TYPE_TRACEPOINT,         /* tp name */
index c5e2a1c..6d0fa65 100644 (file)
@@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+                           struct net_device *dev, struct bpf_nh_params *nh)
 {
-       struct dst_entry *dst = skb_dst(skb);
-       struct net_device *dev = dst->dev;
        u32 hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *nexthop;
+       struct dst_entry *dst = NULL;
        struct neighbour *neigh;
 
        if (dev_xmit_recursion()) {
@@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
        }
 
        rcu_read_lock_bh();
-       nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
-                             &ipv6_hdr(skb)->daddr);
+       if (!nh) {
+               dst = skb_dst(skb);
+               nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+                                     &ipv6_hdr(skb)->daddr);
+       } else {
+               nexthop = &nh->ipv6_nh;
+       }
        neigh = ip_neigh_gw6(dev, nexthop);
        if (likely(!IS_ERR(neigh))) {
                int ret;
@@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
                return ret;
        }
        rcu_read_unlock_bh();
-       IP6_INC_STATS(dev_net(dst->dev),
-                     ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+       if (dst)
+               IP6_INC_STATS(dev_net(dst->dev),
+                             ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
 }
 
-static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+                                  struct bpf_nh_params *nh)
 {
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;
-       struct dst_entry *dst;
-       struct flowi6 fl6 = {
-               .flowi6_flags   = FLOWI_FLAG_ANYSRC,
-               .flowi6_mark    = skb->mark,
-               .flowlabel      = ip6_flowinfo(ip6h),
-               .flowi6_oif     = dev->ifindex,
-               .flowi6_proto   = ip6h->nexthdr,
-               .daddr          = ip6h->daddr,
-               .saddr          = ip6h->saddr,
-       };
 
-       dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
-       if (IS_ERR(dst))
-               goto out_drop;
+       if (!nh) {
+               struct dst_entry *dst;
+               struct flowi6 fl6 = {
+                       .flowi6_flags = FLOWI_FLAG_ANYSRC,
+                       .flowi6_mark  = skb->mark,
+                       .flowlabel    = ip6_flowinfo(ip6h),
+                       .flowi6_oif   = dev->ifindex,
+                       .flowi6_proto = ip6h->nexthdr,
+                       .daddr        = ip6h->daddr,
+                       .saddr        = ip6h->saddr,
+               };
+
+               dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+               if (IS_ERR(dst))
+                       goto out_drop;
 
-       skb_dst_set(skb, dst);
+               skb_dst_set(skb, dst);
+       } else if (nh->nh_family != AF_INET6) {
+               goto out_drop;
+       }
 
-       err = bpf_out_neigh_v6(net, skb);
+       err = bpf_out_neigh_v6(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                dev->stats.tx_errors++;
        else
@@ -2252,7 +2264,8 @@ out_xmit:
        return ret;
 }
 #else
-static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+                                  struct bpf_nh_params *nh)
 {
        kfree_skb(skb);
        return NET_XMIT_DROP;
@@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
 #endif /* CONFIG_IPV6 */
 
 #if IS_ENABLED(CONFIG_INET)
-static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+                           struct net_device *dev, struct bpf_nh_params *nh)
 {
-       struct dst_entry *dst = skb_dst(skb);
-       struct rtable *rt = container_of(dst, struct rtable, dst);
-       struct net_device *dev = dst->dev;
        u32 hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;
@@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
        }
 
        rcu_read_lock_bh();
-       neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+       if (!nh) {
+               struct dst_entry *dst = skb_dst(skb);
+               struct rtable *rt = container_of(dst, struct rtable, dst);
+
+               neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+       } else if (nh->nh_family == AF_INET6) {
+               neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+               is_v6gw = true;
+       } else if (nh->nh_family == AF_INET) {
+               neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+       } else {
+               rcu_read_unlock_bh();
+               goto out_drop;
+       }
+
        if (likely(!IS_ERR(neigh))) {
                int ret;
 
@@ -2309,33 +2334,37 @@ out_drop:
        return -ENETDOWN;
 }
 
-static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+                                  struct bpf_nh_params *nh)
 {
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;
-       struct rtable *rt;
-       struct flowi4 fl4 = {
-               .flowi4_flags   = FLOWI_FLAG_ANYSRC,
-               .flowi4_mark    = skb->mark,
-               .flowi4_tos     = RT_TOS(ip4h->tos),
-               .flowi4_oif     = dev->ifindex,
-               .flowi4_proto   = ip4h->protocol,
-               .daddr          = ip4h->daddr,
-               .saddr          = ip4h->saddr,
-       };
 
-       rt = ip_route_output_flow(net, &fl4, NULL);
-       if (IS_ERR(rt))
-               goto out_drop;
-       if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
-               ip_rt_put(rt);
-               goto out_drop;
-       }
+       if (!nh) {
+               struct flowi4 fl4 = {
+                       .flowi4_flags = FLOWI_FLAG_ANYSRC,
+                       .flowi4_mark  = skb->mark,
+                       .flowi4_tos   = RT_TOS(ip4h->tos),
+                       .flowi4_oif   = dev->ifindex,
+                       .flowi4_proto = ip4h->protocol,
+                       .daddr        = ip4h->daddr,
+                       .saddr        = ip4h->saddr,
+               };
+               struct rtable *rt;
+
+               rt = ip_route_output_flow(net, &fl4, NULL);
+               if (IS_ERR(rt))
+                       goto out_drop;
+               if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+                       ip_rt_put(rt);
+                       goto out_drop;
+               }
 
-       skb_dst_set(skb, &rt->dst);
+               skb_dst_set(skb, &rt->dst);
+       }
 
-       err = bpf_out_neigh_v4(net, skb);
+       err = bpf_out_neigh_v4(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                dev->stats.tx_errors++;
        else
@@ -2348,14 +2377,16 @@ out_xmit:
        return ret;
 }
 #else
-static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+                                  struct bpf_nh_params *nh)
 {
        kfree_skb(skb);
        return NET_XMIT_DROP;
 }
 #endif /* CONFIG_INET */
 
-static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+                               struct bpf_nh_params *nh)
 {
        struct ethhdr *ethh = eth_hdr(skb);
 
@@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
        skb_reset_network_header(skb);
 
        if (skb->protocol == htons(ETH_P_IP))
-               return __bpf_redirect_neigh_v4(skb, dev);
+               return __bpf_redirect_neigh_v4(skb, dev, nh);
        else if (skb->protocol == htons(ETH_P_IPV6))
-               return __bpf_redirect_neigh_v6(skb, dev);
+               return __bpf_redirect_neigh_v6(skb, dev, nh);
 out:
        kfree_skb(skb);
        return -ENOTSUPP;
@@ -2382,7 +2413,8 @@ out:
 enum {
        BPF_F_NEIGH     = (1ULL << 1),
        BPF_F_PEER      = (1ULL << 2),
-#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER)
+       BPF_F_NEXTHOP   = (1ULL << 3),
+#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
 };
 
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb)
                return -EAGAIN;
        }
        return flags & BPF_F_NEIGH ?
-              __bpf_redirect_neigh(skb, dev) :
+              __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
+                                   &ri->nh : NULL) :
               __bpf_redirect(skb, dev, flags);
 out_drop:
        kfree_skb(skb);
@@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+          int, plen, u64, flags)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-       if (unlikely(flags))
+       if (unlikely((plen && plen < sizeof(*params)) || flags))
                return TC_ACT_SHOT;
 
-       ri->flags = BPF_F_NEIGH;
+       ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
        ri->tgt_index = ifindex;
 
+       BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+       if (plen)
+               memcpy(&ri->nh, params, sizeof(ri->nh));
+
        return TC_ACT_REDIRECT;
 }
 
@@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
-       .arg2_type      = ARG_ANYTHING,
+       .arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
 };
 
 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
index 7d86fdd..6769caa 100755 (executable)
@@ -453,6 +453,7 @@ class PrinterHelpers(Printer):
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
+            'struct bpf_redir_neigh',
             'struct bpf_sk_lookup',
             'struct bpf_sock',
             'struct bpf_sock_addr',
index bf5a99d..e6ceac3 100644 (file)
@@ -3677,15 +3677,19 @@ union bpf_attr {
  *     Return
  *             The id is returned or 0 in case the id could not be retrieved.
  *
- * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
  *     Description
  *             Redirect the packet to another net device of index *ifindex*
  *             and fill in L2 addresses from neighboring subsystem. This helper
  *             is somewhat similar to **bpf_redirect**\ (), except that it
  *             populates L2 addresses as well, meaning, internally, the helper
- *             performs a FIB lookup based on the skb's networking header to
- *             get the address of the next hop and then relies on the neighbor
- *             lookup for the L2 address of the nexthop.
+ *             relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ *             The helper will perform a FIB lookup based on the skb's
+ *             networking header to get the address of the next hop, unless
+ *             this is supplied by the caller in the *params* argument. The
+ *             *plen* argument indicates the len of *params* and should be set
+ *             to 0 if *params* is NULL.
  *
  *             The *flags* argument is reserved and must be 0. The helper is
  *             currently only supported for tc BPF program types, and enabled
@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup {
        __u8    dmac[6];     /* ETH_ALEN */
 };
 
+struct bpf_redir_neigh {
+       /* network family for lookup (AF_INET, AF_INET6) */
+       __u32 nh_family;
+       /* network address of nexthop; skips fib lookup to find gateway */
+       union {
+               __be32          ipv4_nh;
+               __u32           ipv6_nh[4];  /* in6_addr; network order */
+       };
+};
+
 enum bpf_task_fd_type {
        BPF_FD_TYPE_RAW_TRACEPOINT,     /* tp name */
        BPF_FD_TYPE_TRACEPOINT,         /* tp name */