Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev...
diff --git a/net/core/filter.c b/net/core/filter.c
index d81352c..d70187c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_cow(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;
@@ -3255,19 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV4 needs to be changed into
-                * SKB_GSO_TCPV6.
-                */
+               /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
                if (shinfo->gso_type & SKB_GSO_TCPV4) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
                        shinfo->gso_type |=  SKB_GSO_TCPV6;
                }
-
-               /* Due to IPv6 header, MSS needs to be downgraded. */
-               skb_decrease_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IPV6);
@@ -3282,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;
@@ -3296,19 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV6 needs to be changed into
-                * SKB_GSO_TCPV4.
-                */
+               /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
                if (shinfo->gso_type & SKB_GSO_TCPV6) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
                        shinfo->gso_type |=  SKB_GSO_TCPV4;
                }
-
-               /* Due to IPv4 header, MSS can be upgraded. */
-               skb_increase_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IP);
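
Both hunks above change the internals behind the bpf_skb_change_proto() helper: the non-TCP GSO rejection is dropped and gso_size is no longer adjusted for the 20-byte difference between the IPv4 and IPv6 headers. For context, a minimal tc/BPF caller of that helper might look like the sketch below; the section name, function name, and surrounding logic are illustrative assumptions, not part of this diff.

```c
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Illustrative tc/BPF program exercising bpf_skb_change_proto(), the helper
 * backed by bpf_skb_proto_4_to_6()/_6_to_4() above.
 */
SEC("tc")
int convert_to_ipv6(struct __sk_buff *skb)
{
	/* Grow the L3 header from IPv4 to IPv6; with this series the helper
	 * no longer rejects non-TCP GSO packets and leaves gso_size alone.
	 */
	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		return TC_ACT_SHOT;

	/* ... the program still has to write a valid IPv6 header here ... */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
```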
@@ -3919,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ *    of the redirect and store it (along with some other metadata) in a per-CPU
+ *    struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ *    call xdp_do_redirect() which will use the information in struct
+ *    bpf_redirect_info to actually enqueue the frame into a map type-specific
+ *    bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ *    which will flush all the different bulk queues, thus completing the
+ *    redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
 void xdp_do_flush(void)
 {
        __dev_flush();
@@ -3927,6 +3933,23 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
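
From a driver's point of view, the three steps described in the comment above line up roughly as in the condensed sketch below. Every dummy_* name and the ring structure are made up for illustration; completion handling, exception tracing, and the other XDP verdicts are elided.

```c
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>

struct dummy_rx_ring {			/* made-up driver state */
	struct napi_struct napi;
	struct net_device *netdev;
	struct bpf_prog *xdp_prog;
};

bool dummy_rx_frame_ready(struct dummy_rx_ring *ring);
void dummy_fill_xdp_buff(struct dummy_rx_ring *ring, struct xdp_buff *xdp);
void dummy_recycle_frame(struct dummy_rx_ring *ring, struct xdp_buff *xdp);

static int dummy_napi_poll(struct napi_struct *napi, int budget)
{
	struct dummy_rx_ring *ring = container_of(napi, struct dummy_rx_ring, napi);
	bool flush = false;
	int done = 0;

	while (done < budget && dummy_rx_frame_ready(ring)) {
		struct xdp_buff xdp;
		u32 act;

		dummy_fill_xdp_buff(ring, &xdp);	/* driver specific */

		/* Step 1 happens inside the program: bpf_redirect{,_map}()
		 * records the target in the per-CPU bpf_redirect_info.
		 */
		act = bpf_prog_run_xdp(ring->xdp_prog, &xdp);

		switch (act) {
		case XDP_REDIRECT:
			/* Step 2: enqueue the frame into the map type's
			 * bulk queue based on bpf_redirect_info.
			 */
			if (xdp_do_redirect(ring->netdev, &xdp, ring->xdp_prog))
				dummy_recycle_frame(ring, &xdp);
			else
				flush = true;
			break;
		/* XDP_PASS/XDP_TX/XDP_DROP handling elided */
		}
		done++;
	}

	/* Step 3: flush all bulk queues before leaving the poll loop; this is
	 * still inside the local_bh_disable() section that provides the RCU
	 * protection described in the comment above.
	 */
	if (flush)
		xdp_do_flush();

	return done;	/* napi_complete_done() handling elided */
}
```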
 
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+       struct bpf_redirect_info *ri;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+               /* Avoid polluting the remote cacheline with a write if it
+                * is not needed. Once we pass this test, we need the
+                * cmpxchg() to make sure the map pointer hasn't been
+                * changed in the meantime by a remote CPU.
+                */
+               if (unlikely(READ_ONCE(ri->map) == map))
+                       cmpxchg(&ri->map, map, NULL);
+       }
+}
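
bpf_clear_redirect_map() exists so that a redirect-capable map can drop any stale per-CPU ri->map reference before the map itself goes away. A hypothetical teardown path might use it as sketched below; the function name is made up, and in the tree this role is played by the devmap free path.

```c
#include <linux/filter.h>
#include <linux/rcupdate.h>

/* Hypothetical map teardown path; the name is illustrative. */
static void example_redirect_map_free(struct bpf_map *map)
{
	/* Drop any stale per-CPU bpf_redirect_info reference to this map
	 * before its memory is released.
	 */
	bpf_clear_redirect_map(map);

	/* Readers run inside NAPI (local_bh_disable() sections), so an RCU
	 * grace period is enough to guarantee no one still sees the map.
	 */
	synchronize_rcu();

	/* ... release entries and the map structure itself ... */
}
```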
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
 {
@@ -3934,6 +3957,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
+       struct bpf_map *map;
        int err;
 
        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3943,7 +3967,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_enqueue(fwd, xdp, dev);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_enqueue_multi(xdp, dev, map,
+                                                   ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_enqueue(fwd, xdp, dev);
+               }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdp, dev);
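
The new ri->map branch corresponds to the broadcast mode of bpf_redirect_map(); the same pattern is repeated for the generic (skb) XDP path further down. A hedged sketch of an XDP program that triggers it is shown below; the map name and size are illustrative.

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 32);
	__type(key, __u32);
	__type(value, __u32);	/* ifindex of each egress device */
} forward_map SEC(".maps");

SEC("xdp")
int xdp_broadcast(struct xdp_md *ctx)
{
	/* Clone the frame to every interface in forward_map except the one
	 * it arrived on; the key argument is ignored in broadcast mode.
	 * This makes xdp_do_redirect() take the dev_map_enqueue_multi()
	 * branch added above.
	 */
	return bpf_redirect_map(&forward_map, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";
```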
@@ -3985,13 +4016,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       enum bpf_map_type map_type, u32 map_id)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+       struct bpf_map *map;
        int err;
 
        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+                                                    ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               }
                if (unlikely(err))
                        goto err;
                break;
@@ -10008,11 +10047,13 @@ out:
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
+                                   struct sock *migrating_sk,
                                    u32 hash)
 {
        reuse_kern->skb = skb;
        reuse_kern->sk = sk;
        reuse_kern->selected_sk = NULL;
+       reuse_kern->migrating_sk = migrating_sk;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;
        reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10021,12 +10062,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
+                                 struct sock *migrating_sk,
                                  u32 hash)
 {
        struct sk_reuseport_kern reuse_kern;
        enum sk_action action;
 
-       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
        action = BPF_PROG_RUN(prog, &reuse_kern);
 
        if (action == SK_PASS)
@@ -10136,6 +10178,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
                return &sk_reuseport_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &sk_reuseport_load_bytes_relative_proto;
+       case BPF_FUNC_get_socket_cookie:
+               return &bpf_get_socket_ptr_cookie_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -10165,6 +10209,14 @@ sk_reuseport_is_valid_access(int off, int size,
        case offsetof(struct sk_reuseport_md, hash):
                return size == size_default;
 
+       case offsetof(struct sk_reuseport_md, sk):
+               info->reg_type = PTR_TO_SOCKET;
+               return size == sizeof(__u64);
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+               return size == sizeof(__u64);
+
        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                if (size < sizeof_field(struct sk_buff, protocol))
@@ -10237,6 +10289,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;
+
+       case offsetof(struct sk_reuseport_md, sk):
+               SK_REUSEPORT_LOAD_FIELD(sk);
+               break;
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+               break;
        }
 
        return insn - insn_buf;
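
Taken together, the sk_reuseport hunks above expose the current socket (sk), the socket being migrated away from a closing listener (migrating_sk, NULL on ordinary lookups), and allow bpf_get_socket_cookie() on those pointers. The sketch below shows how a program might use them; the map, section name, and selection logic are assumptions modelled on the selftests, not part of this diff.

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} target_listener SEC(".maps");

/* Illustrative SK_REUSEPORT_SELECT_OR_MIGRATE program. */
SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	__u64 cookie = bpf_get_socket_cookie(reuse_md->sk);
	__u32 zero = 0;

	if (reuse_md->migrating_sk) {
		/* A request/child socket is being migrated off a closing
		 * listener: steer it to the listener stored in the map
		 * instead of letting it be dropped.
		 */
		if (bpf_sk_select_reuseport(reuse_md, &target_listener,
					    &zero, 0))
			return SK_DROP;
		return SK_PASS;
	}

	/* Ordinary lookup: keep the kernel's hash-based choice. */
	bpf_printk("reuseport lookup, sk cookie %llu", cookie);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";
```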