Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev...
diff --git a/net/core/filter.c b/net/core/filter.c
index d81352c..d70187c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_cow(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;
@@ -3255,19 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV4 needs to be changed into
-                * SKB_GSO_TCPV6.
-                */
+               /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
                if (shinfo->gso_type & SKB_GSO_TCPV4) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
                        shinfo->gso_type |=  SKB_GSO_TCPV6;
                }
-
-               /* Due to IPv6 header, MSS needs to be downgraded. */
-               skb_decrease_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IPV6);
@@ -3282,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;
@@ -3296,19 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV6 needs to be changed into
-                * SKB_GSO_TCPV4.
-                */
+               /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
                if (shinfo->gso_type & SKB_GSO_TCPV6) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
                        shinfo->gso_type |=  SKB_GSO_TCPV4;
                }
-
-               /* Due to IPv4 header, MSS can be upgraded. */
-               skb_increase_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IP);
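
Both hunks above change the internals behind the bpf_skb_change_proto() helper: the non-TCP GSO rejection is dropped and gso_size is no longer adjusted for the 20-byte difference between the IPv4 and IPv6 headers. For context, a minimal tc/BPF caller of that helper might look like the sketch below; the section name, function name, and surrounding logic are illustrative assumptions, not part of this diff.

```c
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Illustrative tc/BPF program exercising bpf_skb_change_proto(), the helper
 * backed by bpf_skb_proto_4_to_6()/_6_to_4() above.
 */
SEC("tc")
int convert_to_ipv6(struct __sk_buff *skb)
{
	/* Grow the L3 header from IPv4 to IPv6; with this series the helper
	 * no longer rejects non-TCP GSO packets and leaves gso_size alone.
	 */
	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		return TC_ACT_SHOT;

	/* ... the program still has to write a valid IPv6 header here ... */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
```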
@@ -3919,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ *    of the redirect and store it (along with some other metadata) in a per-CPU
+ *    struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ *    call xdp_do_redirect() which will use the information in struct
+ *    bpf_redirect_info to actually enqueue the frame into a map type-specific
+ *    bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ *    which will flush all the different bulk queues, thus completing the
+ *    redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
 void xdp_do_flush(void)
 {
        __dev_flush();
@@ -3927,6 +3933,23 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
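
From a driver's point of view, the three steps described in the comment above line up roughly as in the condensed sketch below. Every dummy_* name and the ring structure are made up for illustration; completion handling, exception tracing, and the other XDP verdicts are elided.

```c
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>

struct dummy_rx_ring {			/* made-up driver state */
	struct napi_struct napi;
	struct net_device *netdev;
	struct bpf_prog *xdp_prog;
};

bool dummy_rx_frame_ready(struct dummy_rx_ring *ring);
void dummy_fill_xdp_buff(struct dummy_rx_ring *ring, struct xdp_buff *xdp);
void dummy_recycle_frame(struct dummy_rx_ring *ring, struct xdp_buff *xdp);

static int dummy_napi_poll(struct napi_struct *napi, int budget)
{
	struct dummy_rx_ring *ring = container_of(napi, struct dummy_rx_ring, napi);
	bool flush = false;
	int done = 0;

	while (done < budget && dummy_rx_frame_ready(ring)) {
		struct xdp_buff xdp;
		u32 act;

		dummy_fill_xdp_buff(ring, &xdp);	/* driver specific */

		/* Step 1 happens inside the program: bpf_redirect{,_map}()
		 * records the target in the per-CPU bpf_redirect_info.
		 */
		act = bpf_prog_run_xdp(ring->xdp_prog, &xdp);

		switch (act) {
		case XDP_REDIRECT:
			/* Step 2: enqueue the frame into the map type's
			 * bulk queue based on bpf_redirect_info.
			 */
			if (xdp_do_redirect(ring->netdev, &xdp, ring->xdp_prog))
				dummy_recycle_frame(ring, &xdp);
			else
				flush = true;
			break;
		/* XDP_PASS/XDP_TX/XDP_DROP handling elided */
		}
		done++;
	}

	/* Step 3: flush all bulk queues before leaving the poll loop; this is
	 * still inside the local_bh_disable() section that provides the RCU
	 * protection described in the comment above.
	 */
	if (flush)
		xdp_do_flush();

	return done;	/* napi_complete_done() handling elided */
}
```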
 
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+       struct bpf_redirect_info *ri;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+               /* Avoid polluting the remote cacheline with a write if it
+                * is not needed. Once we pass this test, we need the
+                * cmpxchg() to make sure the map pointer hasn't been
+                * changed in the meantime by a remote CPU.
+                */
+               if (unlikely(READ_ONCE(ri->map) == map))
+                       cmpxchg(&ri->map, map, NULL);
+       }
+}
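
bpf_clear_redirect_map() exists so that a redirect-capable map can drop any stale per-CPU ri->map reference before the map itself goes away. A hypothetical teardown path might use it as sketched below; the function name is made up, and in the tree this role is played by the devmap free path.

```c
#include <linux/filter.h>
#include <linux/rcupdate.h>

/* Hypothetical map teardown path; the name is illustrative. */
static void example_redirect_map_free(struct bpf_map *map)
{
	/* Drop any stale per-CPU bpf_redirect_info reference to this map
	 * before its memory is released.
	 */
	bpf_clear_redirect_map(map);

	/* Readers run inside NAPI (local_bh_disable() sections), so an RCU
	 * grace period is enough to guarantee no one still sees the map.
	 */
	synchronize_rcu();

	/* ... release entries and the map structure itself ... */
}
```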
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
 {
@@ -3934,6 +3957,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
+       struct bpf_map *map;
        int err;
 
        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3943,7 +3967,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_enqueue(fwd, xdp, dev);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_enqueue_multi(xdp, dev, map,
+                                                   ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_enqueue(fwd, xdp, dev);
+               }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdp, dev);
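
The new ri->map branch corresponds to the broadcast mode of bpf_redirect_map(); the same pattern is repeated for the generic (skb) XDP path further down. A hedged sketch of an XDP program that triggers it is shown below; the map name and size are illustrative.

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 32);
	__type(key, __u32);
	__type(value, __u32);	/* ifindex of each egress device */
} forward_map SEC(".maps");

SEC("xdp")
int xdp_broadcast(struct xdp_md *ctx)
{
	/* Clone the frame to every interface in forward_map except the one
	 * it arrived on; the key argument is ignored in broadcast mode.
	 * This makes xdp_do_redirect() take the dev_map_enqueue_multi()
	 * branch added above.
	 */
	return bpf_redirect_map(&forward_map, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";
```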
@@ -3985,13 +4016,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       enum bpf_map_type map_type, u32 map_id)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+       struct bpf_map *map;
        int err;
 
        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+                                                    ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               }
                if (unlikely(err))
                        goto err;
                break;
@@ -10008,11 +10047,13 @@ out:
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
+                                   struct sock *migrating_sk,
                                    u32 hash)
 {
        reuse_kern->skb = skb;
        reuse_kern->sk = sk;
        reuse_kern->selected_sk = NULL;
+       reuse_kern->migrating_sk = migrating_sk;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;
        reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10021,12 +10062,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
+                                 struct sock *migrating_sk,
                                  u32 hash)
 {
        struct sk_reuseport_kern reuse_kern;
        enum sk_action action;
 
-       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
        action = BPF_PROG_RUN(prog, &reuse_kern);
 
        if (action == SK_PASS)
@@ -10136,6 +10178,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
                return &sk_reuseport_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &sk_reuseport_load_bytes_relative_proto;
+       case BPF_FUNC_get_socket_cookie:
+               return &bpf_get_socket_ptr_cookie_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -10165,6 +10209,14 @@ sk_reuseport_is_valid_access(int off, int size,
        case offsetof(struct sk_reuseport_md, hash):
                return size == size_default;
 
+       case offsetof(struct sk_reuseport_md, sk):
+               info->reg_type = PTR_TO_SOCKET;
+               return size == sizeof(__u64);
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+               return size == sizeof(__u64);
+
        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                if (size < sizeof_field(struct sk_buff, protocol))
@@ -10237,6 +10289,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;
+
+       case offsetof(struct sk_reuseport_md, sk):
+               SK_REUSEPORT_LOAD_FIELD(sk);
+               break;
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+               break;
        }
 
        return insn - insn_buf;
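
Taken together, the sk_reuseport hunks above expose the current socket (sk), the socket being migrated away from a closing listener (migrating_sk, NULL on ordinary lookups), and allow bpf_get_socket_cookie() on those pointers. The sketch below shows how a program might use them; the map, section name, and selection logic are assumptions modelled on the selftests, not part of this diff.

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} target_listener SEC(".maps");

/* Illustrative SK_REUSEPORT_SELECT_OR_MIGRATE program. */
SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	__u64 cookie = bpf_get_socket_cookie(reuse_md->sk);
	__u32 zero = 0;

	if (reuse_md->migrating_sk) {
		/* A request/child socket is being migrated off a closing
		 * listener: steer it to the listener stored in the map
		 * instead of letting it be dropped.
		 */
		if (bpf_sk_select_reuseport(reuse_md, &target_listener,
					    &zero, 0))
			return SK_DROP;
		return SK_PASS;
	}

	/* Ordinary lookup: keep the kernel's hash-based choice. */
	bpf_printk("reuseport lookup, sk cookie %llu", cookie);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";
```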