diff --git a/net/core/filter.c b/net/core/filter.c
index d81352c..2e32cee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -77,6 +77,7 @@
 #include <net/transp_v6.h>
 #include <linux/btf_ids.h>
 #include <net/tls.h>
+#include <net/xdp.h>
 
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -113,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
  * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
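
To make the wrapper's contract concrete, here is a minimal userspace sketch
(the function name and the assumption of a plain AF_INET socket are
illustrative): a classic BPF program attached with SO_ATTACH_FILTER, whose
return value becomes pkt_len, so returning 0 makes the wrapper toss every
packet.

    #include <linux/filter.h>
    #include <sys/socket.h>

    /* Attach a filter whose single instruction returns pkt_len = 0,
     * so the socket-level wrapper drops everything on this socket. */
    static int attach_drop_all(int fd)
    {
            struct sock_filter code[] = {
                    BPF_STMT(BPF_RET | BPF_K, 0),
            };
            struct sock_fprog prog = {
                    .len    = sizeof(code) / sizeof(code[0]),
                    .filter = code,
            };

            return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                              &prog, sizeof(prog));
    }
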
@@ -2179,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
        skb->tstamp = 0;
 
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-               struct sk_buff *skb2;
-
-               skb2 = skb_realloc_headroom(skb, hh_len);
-               if (unlikely(!skb2)) {
-                       kfree_skb(skb);
+               skb = skb_expand_head(skb, hh_len);
+               if (!skb)
                        return -ENOMEM;
-               }
-               if (skb->sk)
-                       skb_set_owner_w(skb2, skb->sk);
-               consume_skb(skb);
-               skb = skb2;
        }
 
        rcu_read_lock_bh();
@@ -2213,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
        }
        rcu_read_unlock_bh();
        if (dst)
-               IP6_INC_STATS(dev_net(dst->dev),
-                             ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+               IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
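
This hunk and the matching IPv4 hunk below replace an open-coded headroom
dance with skb_expand_head(). For reference, a hedged sketch of the pattern
being replaced; illustrative only, the real helper lives in
net/core/skbuff.c:

    static struct sk_buff *expand_head_sketch(struct sk_buff *skb,
                                              unsigned int headroom)
    {
            struct sk_buff *nskb = skb_realloc_headroom(skb, headroom);

            if (unlikely(!nskb)) {
                    kfree_skb(skb);         /* original freed on failure */
                    return NULL;
            }
            if (skb->sk)
                    skb_set_owner_w(nskb, skb->sk);
            consume_skb(skb);
            return nskb;
    }

The helper centralizes the failure semantics the callers here rely on: on
error the original skb is already gone, so returning -ENOMEM without a
kfree_skb() is correct.
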
@@ -2286,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
        skb->tstamp = 0;
 
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-               struct sk_buff *skb2;
-
-               skb2 = skb_realloc_headroom(skb, hh_len);
-               if (unlikely(!skb2)) {
-                       kfree_skb(skb);
+               skb = skb_expand_head(skb, hh_len);
+               if (!skb)
                        return -ENOMEM;
-               }
-               if (skb->sk)
-                       skb_set_owner_w(skb2, skb->sk);
-               consume_skb(skb);
-               skb = skb2;
        }
 
        rcu_read_lock_bh();
@@ -3241,9 +3225,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_cow(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;
@@ -3255,19 +3236,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV4 needs to be changed into
-                * SKB_GSO_TCPV6.
-                */
+               /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
                if (shinfo->gso_type & SKB_GSO_TCPV4) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
                        shinfo->gso_type |=  SKB_GSO_TCPV6;
                }
-
-               /* Due to IPv6 header, MSS needs to be downgraded. */
-               skb_decrease_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IPV6);
@@ -3282,9 +3255,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        u32 off = skb_mac_header_len(skb);
        int ret;
 
-       if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-               return -ENOTSUPP;
-
        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;
@@ -3296,19 +3266,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-               /* SKB_GSO_TCPV6 needs to be changed into
-                * SKB_GSO_TCPV4.
-                */
+               /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
                if (shinfo->gso_type & SKB_GSO_TCPV6) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
                        shinfo->gso_type |=  SKB_GSO_TCPV4;
                }
-
-               /* Due to IPv4 header, MSS can be upgraded. */
-               skb_increase_gso_size(shinfo, len_diff);
-               /* Header must be checked, and gso_segs recomputed. */
-               shinfo->gso_type |= SKB_GSO_DODGY;
-               shinfo->gso_segs = 0;
        }
 
        skb->protocol = htons(ETH_P_IP);
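
These two hunks relax bpf_skb_change_proto() so it no longer rejects
non-TCP GSO packets and no longer touches the GSO size. A hedged BPF-side
sketch of a tc program exercising this path; the section name and the
elided header rewrite are illustrative:

    #include <linux/bpf.h>
    #include <linux/if_ether.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("tc")
    int v4_to_v6(struct __sk_buff *skb)
    {
            /* Backed by bpf_skb_proto_4_to_6() above. */
            if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
                    return TC_ACT_SHOT;
            /* ... rewrite the Ethernet and IPv6 headers here ... */
            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";
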
@@ -3902,8 +3864,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
        if (unlikely(meta < xdp_frame_end ||
                     meta > xdp->data))
                return -EINVAL;
-       if (unlikely((metalen & (sizeof(__u32) - 1)) ||
-                    (metalen > 32)))
+       if (unlikely(xdp_metalen_invalid(metalen)))
                return -EACCES;
 
        xdp->data_meta = meta;
@@ -3919,6 +3880,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
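
A hedged sketch of the consolidated check, assuming it matches the
open-coded test it replaces (the real definition lives in
include/net/xdp.h, pulled in by the new include at the top of this file):

    /* Metadata must be u32-aligned and at most 32 bytes long. */
    static inline bool xdp_metalen_invalid(unsigned long metalen)
    {
            return (metalen & (sizeof(__u32) - 1)) || metalen > 32;
    }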
 
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ *    of the redirect and store it (along with some other metadata) in a per-CPU
+ *    struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ *    call xdp_do_redirect() which will use the information in struct
+ *    bpf_redirect_info to actually enqueue the frame into a map type-specific
+ *    bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ *    which will flush all the different bulk queues, thus completing the
+ *    redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
 void xdp_do_flush(void)
 {
        __dev_flush();
@@ -3927,6 +3916,48 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
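
A hedged BPF-side sketch of step 1 of the process described above; the map
and function names are illustrative:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_DEVMAP);
            __uint(max_entries, 64);
            __type(key, __u32);
            __type(value, __u32);
    } tx_ports SEC(".maps");

    SEC("xdp")
    int redirect_all(struct xdp_md *ctx)
    {
            /* Records the target in the per-CPU bpf_redirect_info;
             * on lookup failure the flags value (XDP_PASS) is
             * returned instead of XDP_REDIRECT. */
            return bpf_redirect_map(&tx_ports, 0, XDP_PASS);
    }

    char _license[] SEC("license") = "GPL";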
 
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+       struct bpf_redirect_info *ri;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+               /* Avoid polluting a remote cacheline with a write if it
+                * is not needed. Once we pass this test, we need the
+                * cmpxchg() to make sure the value hasn't been changed
+                * in the meantime by a remote CPU.
+                */
+               if (unlikely(READ_ONCE(ri->map) == map))
+                       cmpxchg(&ri->map, map, NULL);
+       }
+}
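
A hedged sketch of the caller side, simplified from the map teardown path
(e.g. kernel/bpf/devmap.c); the function name is illustrative:

    static void dev_map_free_sketch(struct bpf_map *map)
    {
            /* Drop any per-CPU ri->map reference to the dying map,
             * then wait out in-flight NAPI readers before freeing
             * the entries. */
            bpf_clear_redirect_map(map);
            synchronize_rcu();
            /* ... free netdev entries and the map itself ... */
    }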
+
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
+{
+       struct net_device *master, *slave;
+       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+       master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+       slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+       if (slave && slave != xdp->rxq->dev) {
+               /* The target device is different from the receiving device, so
+                * redirect it to the new device.
+                * Using XDP_REDIRECT gets the correct behaviour from XDP-enabled
+                * drivers, which unmap the packet from their rx ring.
+                */
+               ri->tgt_index = slave->ifindex;
+               ri->map_id = INT_MAX;
+               ri->map_type = BPF_MAP_TYPE_UNSPEC;
+               return XDP_REDIRECT;
+       }
+       return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
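
A hedged sketch of the call site, simplified from include/linux/filter.h
of this era: once the bonding driver enables the static key, an XDP_TX
verdict on a bond slave is rewritten into the redirect prepared here.

    static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                                                struct xdp_buff *xdp)
    {
            u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp));

            if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
                    if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
                            act = xdp_master_redirect(xdp);
            }
            return act;
    }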
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
 {
@@ -3934,6 +3965,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
+       struct bpf_map *map;
        int err;
 
        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3943,7 +3975,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_enqueue(fwd, xdp, dev);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_enqueue_multi(xdp, dev, map,
+                                                   ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_enqueue(fwd, xdp, dev);
+               }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdp, dev);
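
A hedged BPF-side sketch of what arms this branch, reusing the illustrative
tx_ports devmap from the earlier sketch: redirecting with BPF_F_BROADCAST
leaves ri->map set, so xdp_do_redirect() takes the dev_map_enqueue_multi()
path, and BPF_F_EXCLUDE_INGRESS additionally skips the receiving interface.

    SEC("xdp")
    int broadcast_all(struct xdp_md *ctx)
    {
            return bpf_redirect_map(&tx_ports, 0,
                                    BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
    }
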
@@ -3985,13 +4024,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       enum bpf_map_type map_type, u32 map_id)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+       struct bpf_map *map;
        int err;
 
        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
-               err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               map = READ_ONCE(ri->map);
+               if (unlikely(map)) {
+                       WRITE_ONCE(ri->map, NULL);
+                       err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+                                                    ri->flags & BPF_F_EXCLUDE_INGRESS);
+               } else {
+                       err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+               }
                if (unlikely(err))
                        goto err;
                break;
@@ -4001,8 +4048,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                        goto err;
                consume_skb(skb);
                break;
+       case BPF_MAP_TYPE_CPUMAP:
+               err = cpu_map_generic_redirect(fwd, skb);
+               if (unlikely(err))
+                       goto err;
+               break;
        default:
-               /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
                err = -EBADRQC;
                goto err;
        }
@@ -4625,6 +4676,30 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
 };
 
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+       return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+       .func           = bpf_get_netns_cookie_sock_ops,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+       return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+       .func           = bpf_get_netns_cookie_sk_msg,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
+};
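
A hedged BPF-side sketch of a consumer of the new protos (the same helper
is also wired up for sk_msg programs further down); the program merely
logs the cookie:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("sockops")
    int track_netns(struct bpf_sock_ops *skops)
    {
            /* Scope per-connection state by network namespace. */
            __u64 cookie = bpf_get_netns_cookie(skops);

            bpf_printk("netns cookie: %llu", cookie);
            return 1;
    }

    char _license[] SEC("license") = "GPL";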
+
 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
 {
        struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4973,6 +5048,46 @@ err_clear:
        return -EINVAL;
 }
 
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+          int, optname, char *, optval, int, optlen)
+{
+       if (level == SOL_TCP && optname == TCP_CONGESTION) {
+               if (optlen >= sizeof("cdg") - 1 &&
+                   !strncmp("cdg", optval, optlen))
+                       return -ENOTSUPP;
+       }
+
+       return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+       .func           = bpf_sk_setsockopt,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_MEM,
+       .arg5_type      = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+          int, optname, char *, optval, int, optlen)
+{
+       return _bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+       .func           = bpf_sk_getsockopt,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg5_type      = ARG_CONST_SIZE,
+};
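
A hedged BPF-side sketch of a user of these protos, modelled on the tcp
bpf_iter use case; it assumes a bpftool-generated vmlinux.h, and the
socket-option constants are open-coded for brevity:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    #define SOL_TCP         6
    #define TCP_CONGESTION  13

    SEC("iter/tcp")
    int set_cubic(struct bpf_iter__tcp *ctx)
    {
            struct sock_common *sk_common = ctx->sk_common;
            struct tcp_sock *tp;
            char cc[] = "cubic";

            if (!sk_common)
                    return 0;
            tp = bpf_skc_to_tcp_sock(sk_common);
            if (!tp)
                    return 0;
            /* Routed through bpf_sk_setsockopt(); asking for "cdg"
             * here would fail with -ENOTSUPP, as coded above. */
            bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
            return 0;
    }

    char _license[] SEC("license") = "GPL";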
+
 BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
 {
@@ -7406,6 +7521,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_get_netns_cookie:
+               return &bpf_get_netns_cookie_sock_ops_proto;
 #ifdef CONFIG_INET
        case BPF_FUNC_load_hdr_opt:
                return &bpf_sock_ops_load_hdr_opt_proto;
@@ -7452,6 +7569,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_get_netns_cookie:
+               return &bpf_get_netns_cookie_sk_msg_proto;
 #ifdef CONFIG_CGROUPS
        case BPF_FUNC_get_current_cgroup_id:
                return &bpf_get_current_cgroup_id_proto;
@@ -10008,11 +10127,13 @@ out:
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
+                                   struct sock *migrating_sk,
                                    u32 hash)
 {
        reuse_kern->skb = skb;
        reuse_kern->sk = sk;
        reuse_kern->selected_sk = NULL;
+       reuse_kern->migrating_sk = migrating_sk;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;
        reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10021,13 +10142,14 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
+                                 struct sock *migrating_sk,
                                  u32 hash)
 {
        struct sk_reuseport_kern reuse_kern;
        enum sk_action action;
 
-       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
-       action = BPF_PROG_RUN(prog, &reuse_kern);
+       bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
+       action = bpf_prog_run(prog, &reuse_kern);
 
        if (action == SK_PASS)
                return reuse_kern.selected_sk;
@@ -10136,6 +10258,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
                return &sk_reuseport_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &sk_reuseport_load_bytes_relative_proto;
+       case BPF_FUNC_get_socket_cookie:
+               return &bpf_get_socket_ptr_cookie_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -10165,6 +10289,14 @@ sk_reuseport_is_valid_access(int off, int size,
        case offsetof(struct sk_reuseport_md, hash):
                return size == size_default;
 
+       case offsetof(struct sk_reuseport_md, sk):
+               info->reg_type = PTR_TO_SOCKET;
+               return size == sizeof(__u64);
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+               return size == sizeof(__u64);
+
        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                if (size < sizeof_field(struct sk_buff, protocol))
@@ -10237,6 +10369,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;
+
+       case offsetof(struct sk_reuseport_md, sk):
+               SK_REUSEPORT_LOAD_FIELD(sk);
+               break;
+
+       case offsetof(struct sk_reuseport_md, migrating_sk):
+               SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+               break;
        }
 
        return insn - insn_buf;
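
A hedged BPF-side sketch tying the reuseport additions together: the
bpf_get_socket_cookie() proto wired up above plus the new sk and
migrating_sk context fields. Map and section names follow the selftest
conventions but are illustrative:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u64);
    } pool SEC(".maps");

    SEC("sk_reuseport/migrate")
    int migrate_or_select(struct sk_reuseport_md *md)
    {
            __u32 key = 0;

            /* migrating_sk is non-NULL only when a request is being
             * moved off a closing listener, matching the
             * PTR_TO_SOCK_COMMON_OR_NULL annotation above. */
            if (md->migrating_sk)
                    bpf_printk("migrating, cookie %llu",
                               bpf_get_socket_cookie(md->sk));

            if (bpf_sk_select_reuseport(md, &pool, &key, 0))
                    return SK_DROP;
            return SK_PASS;
    }

    char _license[] SEC("license") = "GPL";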