bpf, sockmap: convert to generic sk_msg interface

author Daniel Borkmann <daniel@iogearbox.net>
Sat, 13 Oct 2018 00:45:58 +0000 (02:45 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Mon, 15 Oct 2018 19:23:19 +0000 (12:23 -0700)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b55871..e60fff4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
  }
  #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
  
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-                       struct bpf_prog *prog);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
  #else
-static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-       return NULL;
-}
-
-static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
-                                                   void *key)
-{
-       return NULL;
-}
-
-static inline int sock_map_prog(struct bpf_map *map,
-                               struct bpf_prog *prog,
-                               u32 type)
+static inline int sock_map_prog_update(struct bpf_map *map,
+                                      struct bpf_prog *prog, u32 which)
  {
         return -EOPNOTSUPP;
  }
  
-static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-                                     struct bpf_prog *prog)
+static inline int sock_map_get_from_fd(const union bpf_attr *attr,
+                                      struct bpf_prog *prog)
  {
         return -EINVAL;
  }
@@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
  extern const struct bpf_func_proto bpf_sock_map_update_proto;
  extern const struct bpf_func_proto bpf_sock_hash_update_proto;
  extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
  
  extern const struct bpf_func_proto bpf_get_local_storage_proto;
  
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5432f4c..fa48343 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
  #ifdef CONFIG_NET
  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
+#if defined(CONFIG_BPF_STREAM_PARSER)
  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
  #endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6791a0a..5771874 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -520,24 +520,6 @@ struct bpf_skb_data_end {
         void *data_end;
  };
  
-struct sk_msg_buff {
-       void *data;
-       void *data_end;
-       __u32 apply_bytes;
-       __u32 cork_bytes;
-       int sg_copybreak;
-       int sg_start;
-       int sg_curr;
-       int sg_end;
-       struct scatterlist sg_data[MAX_SKB_FRAGS];
-       bool sg_copy[MAX_SKB_FRAGS];
-       __u32 flags;
-       struct sock *sk_redir;
-       struct sock *sk;
-       struct sk_buff *skb;
-       struct list_head list;
-};
-
  struct bpf_redirect_info {
         u32 ifindex;
         u32 flags;
@@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
  
  void bpf_warn_invalid_xdp_action(u32 act);
  
-struct sock *do_sk_redirect_map(struct sk_buff *skb);
-struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
-
  #ifdef CONFIG_INET
  struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                   struct bpf_prog *prog, struct sk_buff *skb,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
new file mode 100644
index 0000000..9567810
--- /dev/null
+++ b/include/linux/skmsg.h
@@ -0,0 +1,371 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#ifndef _LINUX_SKMSG_H
+#define _LINUX_SKMSG_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/strparser.h>
+
+#define MAX_MSG_FRAGS                  MAX_SKB_FRAGS
+
+enum __sk_action {
+       __SK_DROP = 0,
+       __SK_PASS,
+       __SK_REDIRECT,
+       __SK_NONE,
+};
+
+struct sk_msg_sg {
+       u32                             start;
+       u32                             curr;
+       u32                             end;
+       u32                             size;
+       u32                             copybreak;
+       bool                            copy[MAX_MSG_FRAGS];
+       struct scatterlist              data[MAX_MSG_FRAGS];
+};
+
+struct sk_msg {
+       struct sk_msg_sg                sg;
+       void                            *data;
+       void                            *data_end;
+       u32                             apply_bytes;
+       u32                             cork_bytes;
+       u32                             flags;
+       struct sk_buff                  *skb;
+       struct sock                     *sk_redir;
+       struct sock                     *sk;
+       struct list_head                list;
+};
+
+struct sk_psock_progs {
+       struct bpf_prog                 *msg_parser;
+       struct bpf_prog                 *skb_parser;
+       struct bpf_prog                 *skb_verdict;
+};
+
+enum sk_psock_state_bits {
+       SK_PSOCK_TX_ENABLED,
+};
+
+struct sk_psock_link {
+       struct list_head                list;
+       struct bpf_map                  *map;
+       void                            *link_raw;
+};
+
+struct sk_psock_parser {
+       struct strparser                strp;
+       bool                            enabled;
+       void (*saved_data_ready)(struct sock *sk);
+};
+
+struct sk_psock_work_state {
+       struct sk_buff                  *skb;
+       u32                             len;
+       u32                             off;
+};
+
+struct sk_psock {
+       struct sock                     *sk;
+       struct sock                     *sk_redir;
+       u32                             apply_bytes;
+       u32                             cork_bytes;
+       u32                             eval;
+       struct sk_msg                   *cork;
+       struct sk_psock_progs           progs;
+       struct sk_psock_parser          parser;
+       struct sk_buff_head             ingress_skb;
+       struct list_head                ingress_msg;
+       unsigned long                   state;
+       struct list_head                link;
+       spinlock_t                      link_lock;
+       refcount_t                      refcnt;
+       void (*saved_unhash)(struct sock *sk);
+       void (*saved_close)(struct sock *sk, long timeout);
+       void (*saved_write_space)(struct sock *sk);
+       struct proto                    *sk_proto;
+       struct sk_psock_work_state      work_state;
+       struct work_struct              work;
+       union {
+               struct rcu_head         rcu;
+               struct work_struct      gc;
+       };
+};
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+                int elem_first_coalesce);
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
+int sk_msg_free(struct sock *sk, struct sk_msg *msg);
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+                                 u32 bytes);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+                             struct sk_msg *msg, u32 bytes);
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+                            struct sk_msg *msg, u32 bytes);
+
+static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
+{
+       WARN_ON(i == msg->sg.end && bytes);
+}
+
+static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
+{
+       if (psock->apply_bytes) {
+               if (psock->apply_bytes < bytes)
+                       psock->apply_bytes = 0;
+               else
+                       psock->apply_bytes -= bytes;
+       }
+}
+
+#define sk_msg_iter_var_prev(var)                      \
+       do {                                            \
+               if (var == 0)                           \
+                       var = MAX_MSG_FRAGS - 1;        \
+               else                                    \
+                       var--;                          \
+       } while (0)
+
+#define sk_msg_iter_var_next(var)                      \
+       do {                                            \
+               var++;                                  \
+               if (var == MAX_MSG_FRAGS)               \
+                       var = 0;                        \
+       } while (0)
+
+#define sk_msg_iter_prev(msg, which)                   \
+       sk_msg_iter_var_prev(msg->sg.which)
+
+#define sk_msg_iter_next(msg, which)                   \
+       sk_msg_iter_var_next(msg->sg.which)
+
+static inline void sk_msg_clear_meta(struct sk_msg *msg)
+{
+       memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
+}
+
+static inline void sk_msg_init(struct sk_msg *msg)
+{
+       memset(msg, 0, sizeof(*msg));
+       sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data));
+}
+
+static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
+                              int which, u32 size)
+{
+       dst->sg.data[which] = src->sg.data[which];
+       dst->sg.data[which].length  = size;
+       src->sg.data[which].length -= size;
+       src->sg.data[which].offset += size;
+}
+
+static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
+{
+       return msg->sg.end >= msg->sg.start ?
+               msg->sg.end - msg->sg.start :
+               msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
+}
+
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+       return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
+static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
+{
+       return &msg->sg.data[which];
+}
+
+static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
+{
+       return sg_page(sk_msg_elem(msg, which));
+}
+
+static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
+{
+       return msg->flags & BPF_F_INGRESS;
+}
+
+static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
+{
+       struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
+
+       if (msg->sg.copy[msg->sg.start]) {
+               msg->data = NULL;
+               msg->data_end = NULL;
+       } else {
+               msg->data = sg_virt(sge);
+               msg->data_end = msg->data + sge->length;
+       }
+}
+
+static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
+                                  u32 len, u32 offset)
+{
+       struct scatterlist *sge;
+
+       get_page(page);
+       sge = sk_msg_elem(msg, msg->sg.end);
+       sg_set_page(sge, page, len, offset);
+       sg_unmark_end(sge);
+
+       msg->sg.copy[msg->sg.end] = true;
+       msg->sg.size += len;
+       sk_msg_iter_next(msg, end);
+}
+
+static inline struct sk_psock *sk_psock(const struct sock *sk)
+{
+       return rcu_dereference_sk_user_data(sk);
+}
+
+static inline bool sk_has_psock(struct sock *sk)
+{
+       return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
+}
+
+static inline void sk_psock_queue_msg(struct sk_psock *psock,
+                                     struct sk_msg *msg)
+{
+       list_add_tail(&msg->list, &psock->ingress_msg);
+}
+
+static inline void sk_psock_report_error(struct sk_psock *psock, int err)
+{
+       struct sock *sk = psock->sk;
+
+       sk->sk_err = err;
+       sk->sk_error_report(sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node);
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+                        struct sk_msg *msg);
+
+static inline struct sk_psock_link *sk_psock_init_link(void)
+{
+       return kzalloc(sizeof(struct sk_psock_link),
+                      GFP_ATOMIC | __GFP_NOWARN);
+}
+
+static inline void sk_psock_free_link(struct sk_psock_link *link)
+{
+       kfree(link);
+}
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
+#else
+static inline void sk_psock_unlink(struct sock *sk,
+                                  struct sk_psock_link *link)
+{
+}
+#endif
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
+
+static inline void sk_psock_cork_free(struct sk_psock *psock)
+{
+       if (psock->cork) {
+               sk_msg_free(psock->sk, psock->cork);
+               kfree(psock->cork);
+               psock->cork = NULL;
+       }
+}
+
+static inline void sk_psock_update_proto(struct sock *sk,
+                                        struct sk_psock *psock,
+                                        struct proto *ops)
+{
+       psock->saved_unhash = sk->sk_prot->unhash;
+       psock->saved_close = sk->sk_prot->close;
+       psock->saved_write_space = sk->sk_write_space;
+
+       psock->sk_proto = sk->sk_prot;
+       sk->sk_prot = ops;
+}
+
+static inline void sk_psock_restore_proto(struct sock *sk,
+                                         struct sk_psock *psock)
+{
+       if (psock->sk_proto) {
+               sk->sk_prot = psock->sk_proto;
+               psock->sk_proto = NULL;
+       }
+}
+
+static inline void sk_psock_set_state(struct sk_psock *psock,
+                                     enum sk_psock_state_bits bit)
+{
+       set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+                                       enum sk_psock_state_bits bit)
+{
+       clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+                                      enum sk_psock_state_bits bit)
+{
+       return test_bit(bit, &psock->state);
+}
+
+static inline struct sk_psock *sk_psock_get(struct sock *sk)
+{
+       struct sk_psock *psock;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (psock && !refcount_inc_not_zero(&psock->refcnt))
+               psock = NULL;
+       rcu_read_unlock();
+       return psock;
+}
+
+void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
+void sk_psock_destroy(struct rcu_head *rcu);
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
+
+static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
+{
+       if (refcount_dec_and_test(&psock->refcnt))
+               sk_psock_drop(sk, psock);
+}
+
+static inline void psock_set_prog(struct bpf_prog **pprog,
+                                 struct bpf_prog *prog)
+{
+       prog = xchg(pprog, prog);
+       if (prog)
+               bpf_prog_put(prog);
+}
+
+static inline void psock_progs_drop(struct sk_psock_progs *progs)
+{
+       psock_set_prog(&progs->msg_parser, NULL);
+       psock_set_prog(&progs->skb_parser, NULL);
+       psock_set_prog(&progs->skb_verdict, NULL);
+}
+
+#endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8f5cef6..3600ae0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
         TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
  }
  
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
+}
+
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->bpf.sk_redir;
+}
+
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+       TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
+}
+
  #if IS_ENABLED(CONFIG_IPV6)
  /* This is the variant of inet6_iif() that must be used by TCP,
   * as TCP moves IP6CB into a different location in skb->cb[]
@@ -2064,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
         __MODULE_INFO(alias, alias_userspace, name);            \
         __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
  
+struct sk_msg;
+struct sk_psock;
+
+int tcp_bpf_init(struct sock *sk);
+void tcp_bpf_reinit(struct sock *sk);
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
+                         int flags);
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+                   int nonblock, int flags, int *addr_len);
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+                     struct msghdr *msg, int len);
+
  /* Call BPF_SOCK_OPS program that returns an int. If the return value
   * is < 0, then the BPF op failed (for example if the loaded BPF
   * program does not support the chosen operation or there is no BPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0488b82..ff82626 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
  obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
  endif
  obj-$(CONFIG_BPF_SYSCALL) += offload.o
-ifeq ($(CONFIG_STREAM_PARSER),y)
-ifeq ($(CONFIG_INET),y)
-obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
-endif
-endif
  endif
  ifeq ($(CONFIG_PERF_EVENTS),y)
  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1a..defcf4d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
  const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
  const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
-const struct bpf_func_proto bpf_sock_map_update_proto __weak;
-const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
  
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
deleted file mode 100644
index de6f7a6..0000000
--- a/kernel/bpf/sockmap.c
+++ /dev/null
@@ -1,2610 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-/* A BPF sock_map is used to store sock objects. This is primarly used
- * for doing socket redirect with BPF helper routines.
- *
- * A sock map may have BPF programs attached to it, currently a program
- * used to parse packets and a program to provide a verdict and redirect
- * decision on the packet are supported. Any programs attached to a sock
- * map are inherited by sock objects when they are added to the map. If
- * no BPF programs are attached the sock object may only be used for sock
- * redirect.
- *
- * A sock object may be in multiple maps, but can only inherit a single
- * parse or verdict program. If adding a sock object to a map would result
- * in having multiple parsing programs the update will return an EBUSY error.
- *
- * For reference this program is similar to devmap used in XDP context
- * reviewing these together may be useful. For an example please review
- * ./samples/bpf/sockmap/.
- */
-#include <linux/bpf.h>
-#include <net/sock.h>
-#include <linux/filter.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/workqueue.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <net/strparser.h>
-#include <net/tcp.h>
-#include <linux/ptr_ring.h>
-#include <net/inet_common.h>
-#include <linux/sched/signal.h>
-
-#define SOCK_CREATE_FLAG_MASK \
-       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
-struct bpf_sock_progs {
-       struct bpf_prog *bpf_tx_msg;
-       struct bpf_prog *bpf_parse;
-       struct bpf_prog *bpf_verdict;
-};
-
-struct bpf_stab {
-       struct bpf_map map;
-       struct sock **sock_map;
-       struct bpf_sock_progs progs;
-       raw_spinlock_t lock;
-};
-
-struct bucket {
-       struct hlist_head head;
-       raw_spinlock_t lock;
-};
-
-struct bpf_htab {
-       struct bpf_map map;
-       struct bucket *buckets;
-       atomic_t count;
-       u32 n_buckets;
-       u32 elem_size;
-       struct bpf_sock_progs progs;
-       struct rcu_head rcu;
-};
-
-struct htab_elem {
-       struct rcu_head rcu;
-       struct hlist_node hash_node;
-       u32 hash;
-       struct sock *sk;
-       char key[0];
-};
-
-enum smap_psock_state {
-       SMAP_TX_RUNNING,
-};
-
-struct smap_psock_map_entry {
-       struct list_head list;
-       struct bpf_map *map;
-       struct sock **entry;
-       struct htab_elem __rcu *hash_link;
-};
-
-struct smap_psock {
-       struct rcu_head rcu;
-       refcount_t refcnt;
-
-       /* datapath variables */
-       struct sk_buff_head rxqueue;
-       bool strp_enabled;
-
-       /* datapath error path cache across tx work invocations */
-       int save_rem;
-       int save_off;
-       struct sk_buff *save_skb;
-
-       /* datapath variables for tx_msg ULP */
-       struct sock *sk_redir;
-       int apply_bytes;
-       int cork_bytes;
-       int sg_size;
-       int eval;
-       struct sk_msg_buff *cork;
-       struct list_head ingress;
-
-       struct strparser strp;
-       struct bpf_prog *bpf_tx_msg;
-       struct bpf_prog *bpf_parse;
-       struct bpf_prog *bpf_verdict;
-       struct list_head maps;
-       spinlock_t maps_lock;
-
-       /* Back reference used when sock callback trigger sockmap operations */
-       struct sock *sock;
-       unsigned long state;
-
-       struct work_struct tx_work;
-       struct work_struct gc_work;
-
-       struct proto *sk_proto;
-       void (*save_unhash)(struct sock *sk);
-       void (*save_close)(struct sock *sk, long timeout);
-       void (*save_data_ready)(struct sock *sk);
-       void (*save_write_space)(struct sock *sk);
-};
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-                          int nonblock, int flags, int *addr_len);
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-                           int offset, size_t size, int flags);
-static void bpf_tcp_unhash(struct sock *sk);
-static void bpf_tcp_close(struct sock *sk, long timeout);
-
-static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
-{
-       return rcu_dereference_sk_user_data(sk);
-}
-
-static bool bpf_tcp_stream_read(const struct sock *sk)
-{
-       struct smap_psock *psock;
-       bool empty = true;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock))
-               goto out;
-       empty = list_empty(&psock->ingress);
-out:
-       rcu_read_unlock();
-       return !empty;
-}
-
-enum {
-       SOCKMAP_IPV4,
-       SOCKMAP_IPV6,
-       SOCKMAP_NUM_PROTS,
-};
-
-enum {
-       SOCKMAP_BASE,
-       SOCKMAP_TX,
-       SOCKMAP_NUM_CONFIGS,
-};
-
-static struct proto *saved_tcpv6_prot __read_mostly;
-static DEFINE_SPINLOCK(tcpv6_prot_lock);
-static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
-
-static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
-                        struct proto *base)
-{
-       prot[SOCKMAP_BASE]                      = *base;
-       prot[SOCKMAP_BASE].unhash               = bpf_tcp_unhash;
-       prot[SOCKMAP_BASE].close                = bpf_tcp_close;
-       prot[SOCKMAP_BASE].recvmsg              = bpf_tcp_recvmsg;
-       prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
-
-       prot[SOCKMAP_TX]                        = prot[SOCKMAP_BASE];
-       prot[SOCKMAP_TX].sendmsg                = bpf_tcp_sendmsg;
-       prot[SOCKMAP_TX].sendpage               = bpf_tcp_sendpage;
-}
-
-static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
-{
-       int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
-       int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
-
-       sk->sk_prot = &bpf_tcp_prots[family][conf];
-}
-
-static int bpf_tcp_init(struct sock *sk)
-{
-       struct smap_psock *psock;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock)) {
-               rcu_read_unlock();
-               return -EINVAL;
-       }
-
-       if (unlikely(psock->sk_proto)) {
-               rcu_read_unlock();
-               return -EBUSY;
-       }
-
-       psock->save_unhash = sk->sk_prot->unhash;
-       psock->save_close = sk->sk_prot->close;
-       psock->sk_proto = sk->sk_prot;
-
-       /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
-       if (sk->sk_family == AF_INET6 &&
-           unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
-               spin_lock_bh(&tcpv6_prot_lock);
-               if (likely(sk->sk_prot != saved_tcpv6_prot)) {
-                       build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
-                       smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
-               }
-               spin_unlock_bh(&tcpv6_prot_lock);
-       }
-       update_sk_prot(sk, psock);
-       rcu_read_unlock();
-       return 0;
-}
-
-static int __init bpf_sock_init(void)
-{
-       build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
-       return 0;
-}
-core_initcall(bpf_sock_init);
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
-
-static void bpf_tcp_release(struct sock *sk)
-{
-       struct smap_psock *psock;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock))
-               goto out;
-
-       if (psock->cork) {
-               free_start_sg(psock->sock, psock->cork, true);
-               kfree(psock->cork);
-               psock->cork = NULL;
-       }
-
-       if (psock->sk_proto) {
-               sk->sk_prot = psock->sk_proto;
-               psock->sk_proto = NULL;
-       }
-out:
-       rcu_read_unlock();
-}
-
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-                                        u32 hash, void *key, u32 key_size)
-{
-       struct htab_elem *l;
-
-       hlist_for_each_entry_rcu(l, head, hash_node) {
-               if (l->hash == hash && !memcmp(&l->key, key, key_size))
-                       return l;
-       }
-
-       return NULL;
-}
-
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
-       return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-       return &__select_bucket(htab, hash)->head;
-}
-
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
-{
-       atomic_dec(&htab->count);
-       kfree_rcu(l, rcu);
-}
-
-static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
-                                                 struct smap_psock *psock)
-{
-       struct smap_psock_map_entry *e;
-
-       spin_lock_bh(&psock->maps_lock);
-       e = list_first_entry_or_null(&psock->maps,
-                                    struct smap_psock_map_entry,
-                                    list);
-       if (e)
-               list_del(&e->list);
-       spin_unlock_bh(&psock->maps_lock);
-       return e;
-}
-
-static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
-{
-       struct smap_psock_map_entry *e;
-       struct sk_msg_buff *md, *mtmp;
-       struct sock *osk;
-
-       if (psock->cork) {
-               free_start_sg(psock->sock, psock->cork, true);
-               kfree(psock->cork);
-               psock->cork = NULL;
-       }
-
-       list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-               list_del(&md->list);
-               free_start_sg(psock->sock, md, true);
-               kfree(md);
-       }
-
-       e = psock_map_pop(sk, psock);
-       while (e) {
-               if (e->entry) {
-                       struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
-
-                       raw_spin_lock_bh(&stab->lock);
-                       osk = *e->entry;
-                       if (osk == sk) {
-                               *e->entry = NULL;
-                               smap_release_sock(psock, sk);
-                       }
-                       raw_spin_unlock_bh(&stab->lock);
-               } else {
-                       struct htab_elem *link = rcu_dereference(e->hash_link);
-                       struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
-                       struct hlist_head *head;
-                       struct htab_elem *l;
-                       struct bucket *b;
-
-                       b = __select_bucket(htab, link->hash);
-                       head = &b->head;
-                       raw_spin_lock_bh(&b->lock);
-                       l = lookup_elem_raw(head,
-                                           link->hash, link->key,
-                                           htab->map.key_size);
-                       /* If another thread deleted this object skip deletion.
-                        * The refcnt on psock may or may not be zero.
-                        */
-                       if (l && l == link) {
-                               hlist_del_rcu(&link->hash_node);
-                               smap_release_sock(psock, link->sk);
-                               free_htab_elem(htab, link);
-                       }
-                       raw_spin_unlock_bh(&b->lock);
-               }
-               kfree(e);
-               e = psock_map_pop(sk, psock);
-       }
-}
-
-static void bpf_tcp_unhash(struct sock *sk)
-{
-       void (*unhash_fun)(struct sock *sk);
-       struct smap_psock *psock;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock)) {
-               rcu_read_unlock();
-               if (sk->sk_prot->unhash)
-                       sk->sk_prot->unhash(sk);
-               return;
-       }
-       unhash_fun = psock->save_unhash;
-       bpf_tcp_remove(sk, psock);
-       rcu_read_unlock();
-       unhash_fun(sk);
-}
-
-static void bpf_tcp_close(struct sock *sk, long timeout)
-{
-       void (*close_fun)(struct sock *sk, long timeout);
-       struct smap_psock *psock;
-
-       lock_sock(sk);
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock)) {
-               rcu_read_unlock();
-               release_sock(sk);
-               return sk->sk_prot->close(sk, timeout);
-       }
-       close_fun = psock->save_close;
-       bpf_tcp_remove(sk, psock);
-       rcu_read_unlock();
-       release_sock(sk);
-       close_fun(sk, timeout);
-}
-
-enum __sk_action {
-       __SK_DROP = 0,
-       __SK_PASS,
-       __SK_REDIRECT,
-       __SK_NONE,
-};
-
-static int memcopy_from_iter(struct sock *sk,
-                            struct sk_msg_buff *md,
-                            struct iov_iter *from, int bytes)
-{
-       struct scatterlist *sg = md->sg_data;
-       int i = md->sg_curr, rc = -ENOSPC;
-
-       do {
-               int copy;
-               char *to;
-
-               if (md->sg_copybreak >= sg[i].length) {
-                       md->sg_copybreak = 0;
-
-                       if (++i == MAX_SKB_FRAGS)
-                               i = 0;
-
-                       if (i == md->sg_end)
-                               break;
-               }
-
-               copy = sg[i].length - md->sg_copybreak;
-               to = sg_virt(&sg[i]) + md->sg_copybreak;
-               md->sg_copybreak += copy;
-
-               if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
-                       rc = copy_from_iter_nocache(to, copy, from);
-               else
-                       rc = copy_from_iter(to, copy, from);
-
-               if (rc != copy) {
-                       rc = -EFAULT;
-                       goto out;
-               }
-
-               bytes -= copy;
-               if (!bytes)
-                       break;
-
-               md->sg_copybreak = 0;
-               if (++i == MAX_SKB_FRAGS)
-                       i = 0;
-       } while (i != md->sg_end);
-out:
-       md->sg_curr = i;
-       return rc;
-}
-
-static int bpf_tcp_push(struct sock *sk, int apply_bytes,
-                       struct sk_msg_buff *md,
-                       int flags, bool uncharge)
-{
-       bool apply = apply_bytes;
-       struct scatterlist *sg;
-       int offset, ret = 0;
-       struct page *p;
-       size_t size;
-
-       while (1) {
-               sg = md->sg_data + md->sg_start;
-               size = (apply && apply_bytes < sg->length) ?
-                       apply_bytes : sg->length;
-               offset = sg->offset;
-
-               tcp_rate_check_app_limited(sk);
-               p = sg_page(sg);
-retry:
-               ret = do_tcp_sendpages(sk, p, offset, size, flags);
-               if (ret != size) {
-                       if (ret > 0) {
-                               if (apply)
-                                       apply_bytes -= ret;
-
-                               sg->offset += ret;
-                               sg->length -= ret;
-                               size -= ret;
-                               offset += ret;
-                               if (uncharge)
-                                       sk_mem_uncharge(sk, ret);
-                               goto retry;
-                       }
-
-                       return ret;
-               }
-
-               if (apply)
-                       apply_bytes -= ret;
-               sg->offset += ret;
-               sg->length -= ret;
-               if (uncharge)
-                       sk_mem_uncharge(sk, ret);
-
-               if (!sg->length) {
-                       put_page(p);
-                       md->sg_start++;
-                       if (md->sg_start == MAX_SKB_FRAGS)
-                               md->sg_start = 0;
-                       sg_init_table(sg, 1);
-
-                       if (md->sg_start == md->sg_end)
-                               break;
-               }
-
-               if (apply && !apply_bytes)
-                       break;
-       }
-       return 0;
-}
-
-static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
-{
-       struct scatterlist *sg = md->sg_data + md->sg_start;
-
-       if (md->sg_copy[md->sg_start]) {
-               md->data = md->data_end = 0;
-       } else {
-               md->data = sg_virt(sg);
-               md->data_end = md->data + sg->length;
-       }
-}
-
-static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
-{
-       struct scatterlist *sg = md->sg_data;
-       int i = md->sg_start;
-
-       do {
-               int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
-
-               sk_mem_uncharge(sk, uncharge);
-               bytes -= uncharge;
-               if (!bytes)
-                       break;
-               i++;
-               if (i == MAX_SKB_FRAGS)
-                       i = 0;
-       } while (i != md->sg_end);
-}
-
-static void free_bytes_sg(struct sock *sk, int bytes,
-                         struct sk_msg_buff *md, bool charge)
-{
-       struct scatterlist *sg = md->sg_data;
-       int i = md->sg_start, free;
-
-       while (bytes && sg[i].length) {
-               free = sg[i].length;
-               if (bytes < free) {
-                       sg[i].length -= bytes;
-                       sg[i].offset += bytes;
-                       if (charge)
-                               sk_mem_uncharge(sk, bytes);
-                       break;
-               }
-
-               if (charge)
-                       sk_mem_uncharge(sk, sg[i].length);
-               put_page(sg_page(&sg[i]));
-               bytes -= sg[i].length;
-               sg[i].length = 0;
-               sg[i].page_link = 0;
-               sg[i].offset = 0;
-               i++;
-
-               if (i == MAX_SKB_FRAGS)
-                       i = 0;
-       }
-       md->sg_start = i;
-}
-
-static int free_sg(struct sock *sk, int start,
-                  struct sk_msg_buff *md, bool charge)
-{
-       struct scatterlist *sg = md->sg_data;
-       int i = start, free = 0;
-
-       while (sg[i].length) {
-               free += sg[i].length;
-               if (charge)
-                       sk_mem_uncharge(sk, sg[i].length);
-               if (!md->skb)
-                       put_page(sg_page(&sg[i]));
-               sg[i].length = 0;
-               sg[i].page_link = 0;
-               sg[i].offset = 0;
-               i++;
-
-               if (i == MAX_SKB_FRAGS)
-                       i = 0;
-       }
-       consume_skb(md->skb);
-
-       return free;
-}
-
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
-{
-       int free = free_sg(sk, md->sg_start, md, charge);
-
-       md->sg_start = md->sg_end;
-       return free;
-}
-
-static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
-{
-       return free_sg(sk, md->sg_curr, md, true);
-}
-
-static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
-{
-       return ((_rc == SK_PASS) ?
-              (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
-              __SK_DROP);
-}
-
-static unsigned int smap_do_tx_msg(struct sock *sk,
-                                  struct smap_psock *psock,
-                                  struct sk_msg_buff *md)
-{
-       struct bpf_prog *prog;
-       unsigned int rc, _rc;
-
-       preempt_disable();
-       rcu_read_lock();
-
-       /* If the policy was removed mid-send then default to 'accept' */
-       prog = READ_ONCE(psock->bpf_tx_msg);
-       if (unlikely(!prog)) {
-               _rc = SK_PASS;
-               goto verdict;
-       }
-
-       bpf_compute_data_pointers_sg(md);
-       md->sk = sk;
-       rc = (*prog->bpf_func)(md, prog->insnsi);
-       psock->apply_bytes = md->apply_bytes;
-
-       /* Moving return codes from UAPI namespace into internal namespace */
-       _rc = bpf_map_msg_verdict(rc, md);
-
-       /* The psock has a refcount on the sock but not on the map and because
-        * we need to drop rcu read lock here its possible the map could be
-        * removed between here and when we need it to execute the sock
-        * redirect. So do the map lookup now for future use.
-        */
-       if (_rc == __SK_REDIRECT) {
-               if (psock->sk_redir)
-                       sock_put(psock->sk_redir);
-               psock->sk_redir = do_msg_redirect_map(md);
-               if (!psock->sk_redir) {
-                       _rc = __SK_DROP;
-                       goto verdict;
-               }
-               sock_hold(psock->sk_redir);
-       }
-verdict:
-       rcu_read_unlock();
-       preempt_enable();
-
-       return _rc;
-}
-
-static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
-                          struct smap_psock *psock,
-                          struct sk_msg_buff *md, int flags)
-{
-       bool apply = apply_bytes;
-       size_t size, copied = 0;
-       struct sk_msg_buff *r;
-       int err = 0, i;
-
-       r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
-       if (unlikely(!r))
-               return -ENOMEM;
-
-       lock_sock(sk);
-       r->sg_start = md->sg_start;
-       i = md->sg_start;
-
-       do {
-               size = (apply && apply_bytes < md->sg_data[i].length) ?
-                       apply_bytes : md->sg_data[i].length;
-
-               if (!sk_wmem_schedule(sk, size)) {
-                       if (!copied)
-                               err = -ENOMEM;
-                       break;
-               }
-
-               sk_mem_charge(sk, size);
-               r->sg_data[i] = md->sg_data[i];
-               r->sg_data[i].length = size;
-               md->sg_data[i].length -= size;
-               md->sg_data[i].offset += size;
-               copied += size;
-
-               if (md->sg_data[i].length) {
-                       get_page(sg_page(&r->sg_data[i]));
-                       r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
-               } else {
-                       i++;
-                       if (i == MAX_SKB_FRAGS)
-                               i = 0;
-                       r->sg_end = i;
-               }
-
-               if (apply) {
-                       apply_bytes -= size;
-                       if (!apply_bytes)
-                               break;
-               }
-       } while (i != md->sg_end);
-
-       md->sg_start = i;
-
-       if (!err) {
-               list_add_tail(&r->list, &psock->ingress);
-               sk->sk_data_ready(sk);
-       } else {
-               free_start_sg(sk, r, true);
-               kfree(r);
-       }
-
-       release_sock(sk);
-       return err;
-}
-
-static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
-                                      struct sk_msg_buff *md,
-                                      int flags)
-{
-       bool ingress = !!(md->flags & BPF_F_INGRESS);
-       struct smap_psock *psock;
-       int err = 0;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock))
-               goto out_rcu;
-
-       if (!refcount_inc_not_zero(&psock->refcnt))
-               goto out_rcu;
-
-       rcu_read_unlock();
-
-       if (ingress) {
-               err = bpf_tcp_ingress(sk, send, psock, md, flags);
-       } else {
-               lock_sock(sk);
-               err = bpf_tcp_push(sk, send, md, flags, false);
-               release_sock(sk);
-       }
-       smap_release_sock(psock, sk);
-       return err;
-out_rcu:
-       rcu_read_unlock();
-       return 0;
-}
-
-static inline void bpf_md_init(struct smap_psock *psock)
-{
-       if (!psock->apply_bytes) {
-               psock->eval =  __SK_NONE;
-               if (psock->sk_redir) {
-                       sock_put(psock->sk_redir);
-                       psock->sk_redir = NULL;
-               }
-       }
-}
-
-static void apply_bytes_dec(struct smap_psock *psock, int i)
-{
-       if (psock->apply_bytes) {
-               if (psock->apply_bytes < i)
-                       psock->apply_bytes = 0;
-               else
-                       psock->apply_bytes -= i;
-       }
-}
-
-static int bpf_exec_tx_verdict(struct smap_psock *psock,
-                              struct sk_msg_buff *m,
-                              struct sock *sk,
-                              int *copied, int flags)
-{
-       bool cork = false, enospc = (m->sg_start == m->sg_end);
-       struct sock *redir;
-       int err = 0;
-       int send;
-
-more_data:
-       if (psock->eval == __SK_NONE)
-               psock->eval = smap_do_tx_msg(sk, psock, m);
-
-       if (m->cork_bytes &&
-           m->cork_bytes > psock->sg_size && !enospc) {
-               psock->cork_bytes = m->cork_bytes - psock->sg_size;
-               if (!psock->cork) {
-                       psock->cork = kcalloc(1,
-                                       sizeof(struct sk_msg_buff),
-                                       GFP_ATOMIC | __GFP_NOWARN);
-
-                       if (!psock->cork) {
-                               err = -ENOMEM;
-                               goto out_err;
-                       }
-               }
-               memcpy(psock->cork, m, sizeof(*m));
-               goto out_err;
-       }
-
-       send = psock->sg_size;
-       if (psock->apply_bytes && psock->apply_bytes < send)
-               send = psock->apply_bytes;
-
-       switch (psock->eval) {
-       case __SK_PASS:
-               err = bpf_tcp_push(sk, send, m, flags, true);
-               if (unlikely(err)) {
-                       *copied -= free_start_sg(sk, m, true);
-                       break;
-               }
-
-               apply_bytes_dec(psock, send);
-               psock->sg_size -= send;
-               break;
-       case __SK_REDIRECT:
-               redir = psock->sk_redir;
-               apply_bytes_dec(psock, send);
-
-               if (psock->cork) {
-                       cork = true;
-                       psock->cork = NULL;
-               }
-
-               return_mem_sg(sk, send, m);
-               release_sock(sk);
-
-               err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
-               lock_sock(sk);
-
-               if (unlikely(err < 0)) {
-                       int free = free_start_sg(sk, m, false);
-
-                       psock->sg_size = 0;
-                       if (!cork)
-                               *copied -= free;
-               } else {
-                       psock->sg_size -= send;
-               }
-
-               if (cork) {
-                       free_start_sg(sk, m, true);
-                       psock->sg_size = 0;
-                       kfree(m);
-                       m = NULL;
-                       err = 0;
-               }
-               break;
-       case __SK_DROP:
-       default:
-               free_bytes_sg(sk, send, m, true);
-               apply_bytes_dec(psock, send);
-               *copied -= send;
-               psock->sg_size -= send;
-               err = -EACCES;
-               break;
-       }
-
-       if (likely(!err)) {
-               bpf_md_init(psock);
-               if (m &&
-                   m->sg_data[m->sg_start].page_link &&
-                   m->sg_data[m->sg_start].length)
-                       goto more_data;
-       }
-
-out_err:
-       return err;
-}
-
-static int bpf_wait_data(struct sock *sk,
-                        struct smap_psock *psk, int flags,
-                        long timeo, int *err)
-{
-       int rc;
-
-       DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-       add_wait_queue(sk_sleep(sk), &wait);
-       sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-       rc = sk_wait_event(sk, &timeo,
-                          !list_empty(&psk->ingress) ||
-                          !skb_queue_empty(&sk->sk_receive_queue),
-                          &wait);
-       sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-       remove_wait_queue(sk_sleep(sk), &wait);
-
-       return rc;
-}
-
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-                          int nonblock, int flags, int *addr_len)
-{
-       struct iov_iter *iter = &msg->msg_iter;
-       struct smap_psock *psock;
-       int copied = 0;
-
-       if (unlikely(flags & MSG_ERRQUEUE))
-               return inet_recv_error(sk, msg, len, addr_len);
-       if (!skb_queue_empty(&sk->sk_receive_queue))
-               return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock))
-               goto out;
-
-       if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
-               goto out;
-       rcu_read_unlock();
-
-       lock_sock(sk);
-bytes_ready:
-       while (copied != len) {
-               struct scatterlist *sg;
-               struct sk_msg_buff *md;
-               int i;
-
-               md = list_first_entry_or_null(&psock->ingress,
-                                             struct sk_msg_buff, list);
-               if (unlikely(!md))
-                       break;
-               i = md->sg_start;
-               do {
-                       struct page *page;
-                       int n, copy;
-
-                       sg = &md->sg_data[i];
-                       copy = sg->length;
-                       page = sg_page(sg);
-
-                       if (copied + copy > len)
-                               copy = len - copied;
-
-                       n = copy_page_to_iter(page, sg->offset, copy, iter);
-                       if (n != copy) {
-                               md->sg_start = i;
-                               release_sock(sk);
-                               smap_release_sock(psock, sk);
-                               return -EFAULT;
-                       }
-
-                       copied += copy;
-                       sg->offset += copy;
-                       sg->length -= copy;
-                       sk_mem_uncharge(sk, copy);
-
-                       if (!sg->length) {
-                               i++;
-                               if (i == MAX_SKB_FRAGS)
-                                       i = 0;
-                               if (!md->skb)
-                                       put_page(page);
-                       }
-                       if (copied == len)
-                               break;
-               } while (i != md->sg_end);
-               md->sg_start = i;
-
-               if (!sg->length && md->sg_start == md->sg_end) {
-                       list_del(&md->list);
-                       consume_skb(md->skb);
-                       kfree(md);
-               }
-       }
-
-       if (!copied) {
-               long timeo;
-               int data;
-               int err = 0;
-
-               timeo = sock_rcvtimeo(sk, nonblock);
-               data = bpf_wait_data(sk, psock, flags, timeo, &err);
-
-               if (data) {
-                       if (!skb_queue_empty(&sk->sk_receive_queue)) {
-                               release_sock(sk);
-                               smap_release_sock(psock, sk);
-                               copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-                               return copied;
-                       }
-                       goto bytes_ready;
-               }
-
-               if (err)
-                       copied = err;
-       }
-
-       release_sock(sk);
-       smap_release_sock(psock, sk);
-       return copied;
-out:
-       rcu_read_unlock();
-       return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-}
-
-
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-       int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
-       struct sk_msg_buff md = {0};
-       unsigned int sg_copy = 0;
-       struct smap_psock *psock;
-       int copied = 0, err = 0;
-       struct scatterlist *sg;
-       long timeo;
-
-       /* Its possible a sock event or user removed the psock _but_ the ops
-        * have not been reprogrammed yet so we get here. In this case fallback
-        * to tcp_sendmsg. Note this only works because we _only_ ever allow
-        * a single ULP there is no hierarchy here.
-        */
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock)) {
-               rcu_read_unlock();
-               return tcp_sendmsg(sk, msg, size);
-       }
-
-       /* Increment the psock refcnt to ensure its not released while sending a
-        * message. Required because sk lookup and bpf programs are used in
-        * separate rcu critical sections. Its OK if we lose the map entry
-        * but we can't lose the sock reference.
-        */
-       if (!refcount_inc_not_zero(&psock->refcnt)) {
-               rcu_read_unlock();
-               return tcp_sendmsg(sk, msg, size);
-       }
-
-       sg = md.sg_data;
-       sg_init_marker(sg, MAX_SKB_FRAGS);
-       rcu_read_unlock();
-
-       lock_sock(sk);
-       timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
-       while (msg_data_left(msg)) {
-               struct sk_msg_buff *m = NULL;
-               bool enospc = false;
-               int copy;
-
-               if (sk->sk_err) {
-                       err = -sk->sk_err;
-                       goto out_err;
-               }
-
-               copy = msg_data_left(msg);
-               if (!sk_stream_memory_free(sk))
-                       goto wait_for_sndbuf;
-
-               m = psock->cork_bytes ? psock->cork : &md;
-               m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
-               err = sk_alloc_sg(sk, copy, m->sg_data,
-                                 m->sg_start, &m->sg_end, &sg_copy,
-                                 m->sg_end - 1);
-               if (err) {
-                       if (err != -ENOSPC)
-                               goto wait_for_memory;
-                       enospc = true;
-                       copy = sg_copy;
-               }
-
-               err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
-               if (err < 0) {
-                       free_curr_sg(sk, m);
-                       goto out_err;
-               }
-
-               psock->sg_size += copy;
-               copied += copy;
-               sg_copy = 0;
-
-               /* When bytes are being corked skip running BPF program and
-                * applying verdict unless there is no more buffer space. In
-                * the ENOSPC case simply run BPF prorgram with currently
-                * accumulated data. We don't have much choice at this point
-                * we could try extending the page frags or chaining complex
-                * frags but even in these cases _eventually_ we will hit an
-                * OOM scenario. More complex recovery schemes may be
-                * implemented in the future, but BPF programs must handle
-                * the case where apply_cork requests are not honored. The
-                * canonical method to verify this is to check data length.
-                */
-               if (psock->cork_bytes) {
-                       if (copy > psock->cork_bytes)
-                               psock->cork_bytes = 0;
-                       else
-                               psock->cork_bytes -= copy;
-
-                       if (psock->cork_bytes && !enospc)
-                               goto out_cork;
-
-                       /* All cork bytes accounted for re-run filter */
-                       psock->eval = __SK_NONE;
-                       psock->cork_bytes = 0;
-               }
-
-               err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-               if (unlikely(err < 0))
-                       goto out_err;
-               continue;
-wait_for_sndbuf:
-               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
-               err = sk_stream_wait_memory(sk, &timeo);
-               if (err) {
-                       if (m && m != psock->cork)
-                               free_start_sg(sk, m, true);
-                       goto out_err;
-               }
-       }
-out_err:
-       if (err < 0)
-               err = sk_stream_error(sk, msg->msg_flags, err);
-out_cork:
-       release_sock(sk);
-       smap_release_sock(psock, sk);
-       return copied ? copied : err;
-}
-
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-                           int offset, size_t size, int flags)
-{
-       struct sk_msg_buff md = {0}, *m = NULL;
-       int err = 0, copied = 0;
-       struct smap_psock *psock;
-       struct scatterlist *sg;
-       bool enospc = false;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (unlikely(!psock))
-               goto accept;
-
-       if (!refcount_inc_not_zero(&psock->refcnt))
-               goto accept;
-       rcu_read_unlock();
-
-       lock_sock(sk);
-
-       if (psock->cork_bytes) {
-               m = psock->cork;
-               sg = &m->sg_data[m->sg_end];
-       } else {
-               m = &md;
-               sg = m->sg_data;
-               sg_init_marker(sg, MAX_SKB_FRAGS);
-       }
-
-       /* Catch case where ring is full and sendpage is stalled. */
-       if (unlikely(m->sg_end == m->sg_start &&
-           m->sg_data[m->sg_end].length))
-               goto out_err;
-
-       psock->sg_size += size;
-       sg_set_page(sg, page, size, offset);
-       get_page(page);
-       m->sg_copy[m->sg_end] = true;
-       sk_mem_charge(sk, size);
-       m->sg_end++;
-       copied = size;
-
-       if (m->sg_end == MAX_SKB_FRAGS)
-               m->sg_end = 0;
-
-       if (m->sg_end == m->sg_start)
-               enospc = true;
-
-       if (psock->cork_bytes) {
-               if (size > psock->cork_bytes)
-                       psock->cork_bytes = 0;
-               else
-                       psock->cork_bytes -= size;
-
-               if (psock->cork_bytes && !enospc)
-                       goto out_err;
-
-               /* All cork bytes accounted for re-run filter */
-               psock->eval = __SK_NONE;
-               psock->cork_bytes = 0;
-       }
-
-       err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-out_err:
-       release_sock(sk);
-       smap_release_sock(psock, sk);
-       return copied ? copied : err;
-accept:
-       rcu_read_unlock();
-       return tcp_sendpage(sk, page, offset, size, flags);
-}
-
-static void bpf_tcp_msg_add(struct smap_psock *psock,
-                           struct sock *sk,
-                           struct bpf_prog *tx_msg)
-{
-       struct bpf_prog *orig_tx_msg;
-
-       orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
-       if (orig_tx_msg)
-               bpf_prog_put(orig_tx_msg);
-}
-
-static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
-{
-       struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
-       int rc;
-
-       if (unlikely(!prog))
-               return __SK_DROP;
-
-       skb_orphan(skb);
-       /* We need to ensure that BPF metadata for maps is also cleared
-        * when we orphan the skb so that we don't have the possibility
-        * to reference a stale map.
-        */
-       TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-       skb->sk = psock->sock;
-       bpf_compute_data_end_sk_skb(skb);
-       preempt_disable();
-       rc = (*prog->bpf_func)(skb, prog->insnsi);
-       preempt_enable();
-       skb->sk = NULL;
-
-       /* Moving return codes from UAPI namespace into internal namespace */
-       return rc == SK_PASS ?
-               (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
-               __SK_DROP;
-}
-
-static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
-{
-       struct sock *sk = psock->sock;
-       int copied = 0, num_sg;
-       struct sk_msg_buff *r;
-
-       r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
-       if (unlikely(!r))
-               return -EAGAIN;
-
-       if (!sk_rmem_schedule(sk, skb, skb->len)) {
-               kfree(r);
-               return -EAGAIN;
-       }
-
-       sg_init_table(r->sg_data, MAX_SKB_FRAGS);
-       num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
-       if (unlikely(num_sg < 0)) {
-               kfree(r);
-               return num_sg;
-       }
-       sk_mem_charge(sk, skb->len);
-       copied = skb->len;
-       r->sg_start = 0;
-       r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
-       r->skb = skb;
-       list_add_tail(&r->list, &psock->ingress);
-       sk->sk_data_ready(sk);
-       return copied;
-}
-
-static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
-{
-       struct smap_psock *peer;
-       struct sock *sk;
-       __u32 in;
-       int rc;
-
-       rc = smap_verdict_func(psock, skb);
-       switch (rc) {
-       case __SK_REDIRECT:
-               sk = do_sk_redirect_map(skb);
-               if (!sk) {
-                       kfree_skb(skb);
-                       break;
-               }
-
-               peer = smap_psock_sk(sk);
-               in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-
-               if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
-                            !test_bit(SMAP_TX_RUNNING, &peer->state))) {
-                       kfree_skb(skb);
-                       break;
-               }
-
-               if (!in && sock_writeable(sk)) {
-                       skb_set_owner_w(skb, sk);
-                       skb_queue_tail(&peer->rxqueue, skb);
-                       schedule_work(&peer->tx_work);
-                       break;
-               } else if (in &&
-                          atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
-                       skb_queue_tail(&peer->rxqueue, skb);
-                       schedule_work(&peer->tx_work);
-                       break;
-               }
-       /* Fall through and free skb otherwise */
-       case __SK_DROP:
-       default:
-               kfree_skb(skb);
-       }
-}
-
-static void smap_report_sk_error(struct smap_psock *psock, int err)
-{
-       struct sock *sk = psock->sock;
-
-       sk->sk_err = err;
-       sk->sk_error_report(sk);
-}
-
-static void smap_read_sock_strparser(struct strparser *strp,
-                                    struct sk_buff *skb)
-{
-       struct smap_psock *psock;
-
-       rcu_read_lock();
-       psock = container_of(strp, struct smap_psock, strp);
-       smap_do_verdict(psock, skb);
-       rcu_read_unlock();
-}
-
-/* Called with lock held on socket */
-static void smap_data_ready(struct sock *sk)
-{
-       struct smap_psock *psock;
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (likely(psock)) {
-               write_lock_bh(&sk->sk_callback_lock);
-               strp_data_ready(&psock->strp);
-               write_unlock_bh(&sk->sk_callback_lock);
-       }
-       rcu_read_unlock();
-}
-
-static void smap_tx_work(struct work_struct *w)
-{
-       struct smap_psock *psock;
-       struct sk_buff *skb;
-       int rem, off, n;
-
-       psock = container_of(w, struct smap_psock, tx_work);
-
-       /* lock sock to avoid losing sk_socket at some point during loop */
-       lock_sock(psock->sock);
-       if (psock->save_skb) {
-               skb = psock->save_skb;
-               rem = psock->save_rem;
-               off = psock->save_off;
-               psock->save_skb = NULL;
-               goto start;
-       }
-
-       while ((skb = skb_dequeue(&psock->rxqueue))) {
-               __u32 flags;
-
-               rem = skb->len;
-               off = 0;
-start:
-               flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-               do {
-                       if (likely(psock->sock->sk_socket)) {
-                               if (flags)
-                                       n = smap_do_ingress(psock, skb);
-                               else
-                                       n = skb_send_sock_locked(psock->sock,
-                                                                skb, off, rem);
-                       } else {
-                               n = -EINVAL;
-                       }
-
-                       if (n <= 0) {
-                               if (n == -EAGAIN) {
-                                       /* Retry when space is available */
-                                       psock->save_skb = skb;
-                                       psock->save_rem = rem;
-                                       psock->save_off = off;
-                                       goto out;
-                               }
-                               /* Hard errors break pipe and stop xmit */
-                               smap_report_sk_error(psock, n ? -n : EPIPE);
-                               clear_bit(SMAP_TX_RUNNING, &psock->state);
-                               kfree_skb(skb);
-                               goto out;
-                       }
-                       rem -= n;
-                       off += n;
-               } while (rem);
-
-               if (!flags)
-                       kfree_skb(skb);
-       }
-out:
-       release_sock(psock->sock);
-}
-
-static void smap_write_space(struct sock *sk)
-{
-       struct smap_psock *psock;
-       void (*write_space)(struct sock *sk);
-
-       rcu_read_lock();
-       psock = smap_psock_sk(sk);
-       if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
-               schedule_work(&psock->tx_work);
-       write_space = psock->save_write_space;
-       rcu_read_unlock();
-       write_space(sk);
-}
-
-static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
-{
-       if (!psock->strp_enabled)
-               return;
-       sk->sk_data_ready = psock->save_data_ready;
-       sk->sk_write_space = psock->save_write_space;
-       psock->save_data_ready = NULL;
-       psock->save_write_space = NULL;
-       strp_stop(&psock->strp);
-       psock->strp_enabled = false;
-}
-
-static void smap_destroy_psock(struct rcu_head *rcu)
-{
-       struct smap_psock *psock = container_of(rcu,
-                                                 struct smap_psock, rcu);
-
-       /* Now that a grace period has passed there is no longer
-        * any reference to this sock in the sockmap so we can
-        * destroy the psock, strparser, and bpf programs. But,
-        * because we use workqueue sync operations we can not
-        * do it in rcu context
-        */
-       schedule_work(&psock->gc_work);
-}
-
-static bool psock_is_smap_sk(struct sock *sk)
-{
-       return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
-{
-       if (refcount_dec_and_test(&psock->refcnt)) {
-               if (psock_is_smap_sk(sock))
-                       bpf_tcp_release(sock);
-               write_lock_bh(&sock->sk_callback_lock);
-               smap_stop_sock(psock, sock);
-               write_unlock_bh(&sock->sk_callback_lock);
-               clear_bit(SMAP_TX_RUNNING, &psock->state);
-               rcu_assign_sk_user_data(sock, NULL);
-               call_rcu_sched(&psock->rcu, smap_destroy_psock);
-       }
-}
-
-static int smap_parse_func_strparser(struct strparser *strp,
-                                      struct sk_buff *skb)
-{
-       struct smap_psock *psock;
-       struct bpf_prog *prog;
-       int rc;
-
-       rcu_read_lock();
-       psock = container_of(strp, struct smap_psock, strp);
-       prog = READ_ONCE(psock->bpf_parse);
-
-       if (unlikely(!prog)) {
-               rcu_read_unlock();
-               return skb->len;
-       }
-
-       /* Attach socket for bpf program to use if needed we can do this
-        * because strparser clones the skb before handing it to a upper
-        * layer, meaning skb_orphan has been called. We NULL sk on the
-        * way out to ensure we don't trigger a BUG_ON in skb/sk operations
-        * later and because we are not charging the memory of this skb to
-        * any socket yet.
-        */
-       skb->sk = psock->sock;
-       bpf_compute_data_end_sk_skb(skb);
-       rc = (*prog->bpf_func)(skb, prog->insnsi);
-       skb->sk = NULL;
-       rcu_read_unlock();
-       return rc;
-}
-
-static int smap_read_sock_done(struct strparser *strp, int err)
-{
-       return err;
-}
-
-static int smap_init_sock(struct smap_psock *psock,
-                         struct sock *sk)
-{
-       static const struct strp_callbacks cb = {
-               .rcv_msg = smap_read_sock_strparser,
-               .parse_msg = smap_parse_func_strparser,
-               .read_sock_done = smap_read_sock_done,
-       };
-
-       return strp_init(&psock->strp, sk, &cb);
-}
-
-static void smap_init_progs(struct smap_psock *psock,
-                           struct bpf_prog *verdict,
-                           struct bpf_prog *parse)
-{
-       struct bpf_prog *orig_parse, *orig_verdict;
-
-       orig_parse = xchg(&psock->bpf_parse, parse);
-       orig_verdict = xchg(&psock->bpf_verdict, verdict);
-
-       if (orig_verdict)
-               bpf_prog_put(orig_verdict);
-       if (orig_parse)
-               bpf_prog_put(orig_parse);
-}
-
-static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
-{
-       if (sk->sk_data_ready == smap_data_ready)
-               return;
-       psock->save_data_ready = sk->sk_data_ready;
-       psock->save_write_space = sk->sk_write_space;
-       sk->sk_data_ready = smap_data_ready;
-       sk->sk_write_space = smap_write_space;
-       psock->strp_enabled = true;
-}
-
-static void sock_map_remove_complete(struct bpf_stab *stab)
-{
-       bpf_map_area_free(stab->sock_map);
-       kfree(stab);
-}
-
-static void smap_gc_work(struct work_struct *w)
-{
-       struct smap_psock_map_entry *e, *tmp;
-       struct sk_msg_buff *md, *mtmp;
-       struct smap_psock *psock;
-
-       psock = container_of(w, struct smap_psock, gc_work);
-
-       /* no callback lock needed because we already detached sockmap ops */
-       if (psock->strp_enabled)
-               strp_done(&psock->strp);
-
-       cancel_work_sync(&psock->tx_work);
-       __skb_queue_purge(&psock->rxqueue);
-
-       /* At this point all strparser and xmit work must be complete */
-       if (psock->bpf_parse)
-               bpf_prog_put(psock->bpf_parse);
-       if (psock->bpf_verdict)
-               bpf_prog_put(psock->bpf_verdict);
-       if (psock->bpf_tx_msg)
-               bpf_prog_put(psock->bpf_tx_msg);
-
-       if (psock->cork) {
-               free_start_sg(psock->sock, psock->cork, true);
-               kfree(psock->cork);
-       }
-
-       list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-               list_del(&md->list);
-               free_start_sg(psock->sock, md, true);
-               kfree(md);
-       }
-
-       list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               list_del(&e->list);
-               kfree(e);
-       }
-
-       if (psock->sk_redir)
-               sock_put(psock->sk_redir);
-
-       sock_put(psock->sock);
-       kfree(psock);
-}
-
-static struct smap_psock *smap_init_psock(struct sock *sock, int node)
-{
-       struct smap_psock *psock;
-
-       psock = kzalloc_node(sizeof(struct smap_psock),
-                            GFP_ATOMIC | __GFP_NOWARN,
-                            node);
-       if (!psock)
-               return ERR_PTR(-ENOMEM);
-
-       psock->eval =  __SK_NONE;
-       psock->sock = sock;
-       skb_queue_head_init(&psock->rxqueue);
-       INIT_WORK(&psock->tx_work, smap_tx_work);
-       INIT_WORK(&psock->gc_work, smap_gc_work);
-       INIT_LIST_HEAD(&psock->maps);
-       INIT_LIST_HEAD(&psock->ingress);
-       refcount_set(&psock->refcnt, 1);
-       spin_lock_init(&psock->maps_lock);
-
-       rcu_assign_sk_user_data(sock, psock);
-       sock_hold(sock);
-       return psock;
-}
-
-static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
-{
-       struct bpf_stab *stab;
-       u64 cost;
-       int err;
-
-       if (!capable(CAP_NET_ADMIN))
-               return ERR_PTR(-EPERM);
-
-       /* check sanity of attributes */
-       if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-               return ERR_PTR(-EINVAL);
-
-       stab = kzalloc(sizeof(*stab), GFP_USER);
-       if (!stab)
-               return ERR_PTR(-ENOMEM);
-
-       bpf_map_init_from_attr(&stab->map, attr);
-       raw_spin_lock_init(&stab->lock);
-
-       /* make sure page count doesn't overflow */
-       cost = (u64) stab->map.max_entries * sizeof(struct sock *);
-       err = -EINVAL;
-       if (cost >= U32_MAX - PAGE_SIZE)
-               goto free_stab;
-
-       stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-       /* if map size is larger than memlock limit, reject it early */
-       err = bpf_map_precharge_memlock(stab->map.pages);
-       if (err)
-               goto free_stab;
-
-       err = -ENOMEM;
-       stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
-                                           sizeof(struct sock *),
-                                           stab->map.numa_node);
-       if (!stab->sock_map)
-               goto free_stab;
-
-       return &stab->map;
-free_stab:
-       kfree(stab);
-       return ERR_PTR(err);
-}
-
-static void smap_list_map_remove(struct smap_psock *psock,
-                                struct sock **entry)
-{
-       struct smap_psock_map_entry *e, *tmp;
-
-       spin_lock_bh(&psock->maps_lock);
-       list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               if (e->entry == entry) {
-                       list_del(&e->list);
-                       kfree(e);
-               }
-       }
-       spin_unlock_bh(&psock->maps_lock);
-}
-
-static void smap_list_hash_remove(struct smap_psock *psock,
-                                 struct htab_elem *hash_link)
-{
-       struct smap_psock_map_entry *e, *tmp;
-
-       spin_lock_bh(&psock->maps_lock);
-       list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               struct htab_elem *c = rcu_dereference(e->hash_link);
-
-               if (c == hash_link) {
-                       list_del(&e->list);
-                       kfree(e);
-               }
-       }
-       spin_unlock_bh(&psock->maps_lock);
-}
-
-static void sock_map_free(struct bpf_map *map)
-{
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-       int i;
-
-       synchronize_rcu();
-
-       /* At this point no update, lookup or delete operations can happen.
-        * However, be aware we can still get a socket state event updates,
-        * and data ready callabacks that reference the psock from sk_user_data
-        * Also psock worker threads are still in-flight. So smap_release_sock
-        * will only free the psock after cancel_sync on the worker threads
-        * and a grace period expire to ensure psock is really safe to remove.
-        */
-       rcu_read_lock();
-       raw_spin_lock_bh(&stab->lock);
-       for (i = 0; i < stab->map.max_entries; i++) {
-               struct smap_psock *psock;
-               struct sock *sock;
-
-               sock = stab->sock_map[i];
-               if (!sock)
-                       continue;
-               stab->sock_map[i] = NULL;
-               psock = smap_psock_sk(sock);
-               /* This check handles a racing sock event that can get the
-                * sk_callback_lock before this case but after xchg happens
-                * causing the refcnt to hit zero and sock user data (psock)
-                * to be null and queued for garbage collection.
-                */
-               if (likely(psock)) {
-                       smap_list_map_remove(psock, &stab->sock_map[i]);
-                       smap_release_sock(psock, sock);
-               }
-       }
-       raw_spin_unlock_bh(&stab->lock);
-       rcu_read_unlock();
-
-       sock_map_remove_complete(stab);
-}
-
-static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-       u32 i = key ? *(u32 *)key : U32_MAX;
-       u32 *next = (u32 *)next_key;
-
-       if (i >= stab->map.max_entries) {
-               *next = 0;
-               return 0;
-       }
-
-       if (i == stab->map.max_entries - 1)
-               return -ENOENT;
-
-       *next = i + 1;
-       return 0;
-}
-
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-       if (key >= map->max_entries)
-               return NULL;
-
-       return READ_ONCE(stab->sock_map[key]);
-}
-
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
-{
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-       struct smap_psock *psock;
-       int k = *(u32 *)key;
-       struct sock *sock;
-
-       if (k >= map->max_entries)
-               return -EINVAL;
-
-       raw_spin_lock_bh(&stab->lock);
-       sock = stab->sock_map[k];
-       stab->sock_map[k] = NULL;
-       raw_spin_unlock_bh(&stab->lock);
-       if (!sock)
-               return -EINVAL;
-
-       psock = smap_psock_sk(sock);
-       if (!psock)
-               return 0;
-       if (psock->bpf_parse) {
-               write_lock_bh(&sock->sk_callback_lock);
-               smap_stop_sock(psock, sock);
-               write_unlock_bh(&sock->sk_callback_lock);
-       }
-       smap_list_map_remove(psock, &stab->sock_map[k]);
-       smap_release_sock(psock, sock);
-       return 0;
-}
-
-/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
- * done inside rcu critical sections. This ensures on updates that the psock
- * will not be released via smap_release_sock() until concurrent updates/deletes
- * complete. All operations operate on sock_map using cmpxchg and xchg
- * operations to ensure we do not get stale references. Any reads into the
- * map must be done with READ_ONCE() because of this.
- *
- * A psock is destroyed via call_rcu and after any worker threads are cancelled
- * and syncd so we are certain all references from the update/lookup/delete
- * operations as well as references in the data path are no longer in use.
- *
- * Psocks may exist in multiple maps, but only a single set of parse/verdict
- * programs may be inherited from the maps it belongs to. A reference count
- * is kept with the total number of references to the psock from all maps. The
- * psock will not be released until this reaches zero. The psock and sock
- * user data data use the sk_callback_lock to protect critical data structures
- * from concurrent access. This allows us to avoid two updates from modifying
- * the user data in sock and the lock is required anyways for modifying
- * callbacks, we simply increase its scope slightly.
- *
- * Rules to follow,
- *  - psock must always be read inside RCU critical section
- *  - sk_user_data must only be modified inside sk_callback_lock and read
- *    inside RCU critical section.
- *  - psock->maps list must only be read & modified inside sk_callback_lock
- *  - sock_map must use READ_ONCE and (cmp)xchg operations
- *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
- */
-
-static int __sock_map_ctx_update_elem(struct bpf_map *map,
-                                     struct bpf_sock_progs *progs,
-                                     struct sock *sock,
-                                     void *key)
-{
-       struct bpf_prog *verdict, *parse, *tx_msg;
-       struct smap_psock *psock;
-       bool new = false;
-       int err = 0;
-
-       /* 1. If sock map has BPF programs those will be inherited by the
-        * sock being added. If the sock is already attached to BPF programs
-        * this results in an error.
-        */
-       verdict = READ_ONCE(progs->bpf_verdict);
-       parse = READ_ONCE(progs->bpf_parse);
-       tx_msg = READ_ONCE(progs->bpf_tx_msg);
-
-       if (parse && verdict) {
-               /* bpf prog refcnt may be zero if a concurrent attach operation
-                * removes the program after the above READ_ONCE() but before
-                * we increment the refcnt. If this is the case abort with an
-                * error.
-                */
-               verdict = bpf_prog_inc_not_zero(verdict);
-               if (IS_ERR(verdict))
-                       return PTR_ERR(verdict);
-
-               parse = bpf_prog_inc_not_zero(parse);
-               if (IS_ERR(parse)) {
-                       bpf_prog_put(verdict);
-                       return PTR_ERR(parse);
-               }
-       }
-
-       if (tx_msg) {
-               tx_msg = bpf_prog_inc_not_zero(tx_msg);
-               if (IS_ERR(tx_msg)) {
-                       if (parse && verdict) {
-                               bpf_prog_put(parse);
-                               bpf_prog_put(verdict);
-                       }
-                       return PTR_ERR(tx_msg);
-               }
-       }
-
-       psock = smap_psock_sk(sock);
-
-       /* 2. Do not allow inheriting programs if psock exists and has
-        * already inherited programs. This would create confusion on
-        * which parser/verdict program is running. If no psock exists
-        * create one. Inside sk_callback_lock to ensure concurrent create
-        * doesn't update user data.
-        */
-       if (psock) {
-               if (!psock_is_smap_sk(sock)) {
-                       err = -EBUSY;
-                       goto out_progs;
-               }
-               if (READ_ONCE(psock->bpf_parse) && parse) {
-                       err = -EBUSY;
-                       goto out_progs;
-               }
-               if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
-                       err = -EBUSY;
-                       goto out_progs;
-               }
-               if (!refcount_inc_not_zero(&psock->refcnt)) {
-                       err = -EAGAIN;
-                       goto out_progs;
-               }
-       } else {
-               psock = smap_init_psock(sock, map->numa_node);
-               if (IS_ERR(psock)) {
-                       err = PTR_ERR(psock);
-                       goto out_progs;
-               }
-
-               set_bit(SMAP_TX_RUNNING, &psock->state);
-               new = true;
-       }
-
-       /* 3. At this point we have a reference to a valid psock that is
-        * running. Attach any BPF programs needed.
-        */
-       if (tx_msg)
-               bpf_tcp_msg_add(psock, sock, tx_msg);
-       if (new) {
-               err = bpf_tcp_init(sock);
-               if (err)
-                       goto out_free;
-       }
-
-       if (parse && verdict && !psock->strp_enabled) {
-               err = smap_init_sock(psock, sock);
-               if (err)
-                       goto out_free;
-               smap_init_progs(psock, verdict, parse);
-               write_lock_bh(&sock->sk_callback_lock);
-               smap_start_sock(psock, sock);
-               write_unlock_bh(&sock->sk_callback_lock);
-       }
-
-       return err;
-out_free:
-       smap_release_sock(psock, sock);
-out_progs:
-       if (parse && verdict) {
-               bpf_prog_put(parse);
-               bpf_prog_put(verdict);
-       }
-       if (tx_msg)
-               bpf_prog_put(tx_msg);
-       return err;
-}
-
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-                                   struct bpf_map *map,
-                                   void *key, u64 flags)
-{
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-       struct bpf_sock_progs *progs = &stab->progs;
-       struct sock *osock, *sock = skops->sk;
-       struct smap_psock_map_entry *e;
-       struct smap_psock *psock;
-       u32 i = *(u32 *)key;
-       int err;
-
-       if (unlikely(flags > BPF_EXIST))
-               return -EINVAL;
-       if (unlikely(i >= stab->map.max_entries))
-               return -E2BIG;
-
-       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-       if (!e)
-               return -ENOMEM;
-
-       err = __sock_map_ctx_update_elem(map, progs, sock, key);
-       if (err)
-               goto out;
-
-       /* psock guaranteed to be present. */
-       psock = smap_psock_sk(sock);
-       raw_spin_lock_bh(&stab->lock);
-       osock = stab->sock_map[i];
-       if (osock && flags == BPF_NOEXIST) {
-               err = -EEXIST;
-               goto out_unlock;
-       }
-       if (!osock && flags == BPF_EXIST) {
-               err = -ENOENT;
-               goto out_unlock;
-       }
-
-       e->entry = &stab->sock_map[i];
-       e->map = map;
-       spin_lock_bh(&psock->maps_lock);
-       list_add_tail(&e->list, &psock->maps);
-       spin_unlock_bh(&psock->maps_lock);
-
-       stab->sock_map[i] = sock;
-       if (osock) {
-               psock = smap_psock_sk(osock);
-               smap_list_map_remove(psock, &stab->sock_map[i]);
-               smap_release_sock(psock, osock);
-       }
-       raw_spin_unlock_bh(&stab->lock);
-       return 0;
-out_unlock:
-       smap_release_sock(psock, sock);
-       raw_spin_unlock_bh(&stab->lock);
-out:
-       kfree(e);
-       return err;
-}
-
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
-{
-       struct bpf_sock_progs *progs;
-       struct bpf_prog *orig;
-
-       if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-               struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-               progs = &stab->progs;
-       } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
-               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-               progs = &htab->progs;
-       } else {
-               return -EINVAL;
-       }
-
-       switch (type) {
-       case BPF_SK_MSG_VERDICT:
-               orig = xchg(&progs->bpf_tx_msg, prog);
-               break;
-       case BPF_SK_SKB_STREAM_PARSER:
-               orig = xchg(&progs->bpf_parse, prog);
-               break;
-       case BPF_SK_SKB_STREAM_VERDICT:
-               orig = xchg(&progs->bpf_verdict, prog);
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-
-       if (orig)
-               bpf_prog_put(orig);
-
-       return 0;
-}
-
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-                       struct bpf_prog *prog)
-{
-       int ufd = attr->target_fd;
-       struct bpf_map *map;
-       struct fd f;
-       int err;
-
-       f = fdget(ufd);
-       map = __bpf_map_get(f);
-       if (IS_ERR(map))
-               return PTR_ERR(map);
-
-       err = sock_map_prog(map, prog, attr->attach_type);
-       fdput(f);
-       return err;
-}
-
-static void *sock_map_lookup(struct bpf_map *map, void *key)
-{
-       return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int sock_map_update_elem(struct bpf_map *map,
-                               void *key, void *value, u64 flags)
-{
-       struct bpf_sock_ops_kern skops;
-       u32 fd = *(u32 *)value;
-       struct socket *socket;
-       int err;
-
-       socket = sockfd_lookup(fd, &err);
-       if (!socket)
-               return err;
-
-       skops.sk = socket->sk;
-       if (!skops.sk) {
-               fput(socket->file);
-               return -EINVAL;
-       }
-
-       /* ULPs are currently supported only for TCP sockets in ESTABLISHED
-        * state.
-        */
-       if (skops.sk->sk_type != SOCK_STREAM ||
-           skops.sk->sk_protocol != IPPROTO_TCP ||
-           skops.sk->sk_state != TCP_ESTABLISHED) {
-               fput(socket->file);
-               return -EOPNOTSUPP;
-       }
-
-       lock_sock(skops.sk);
-       preempt_disable();
-       rcu_read_lock();
-       err = sock_map_ctx_update_elem(&skops, map, key, flags);
-       rcu_read_unlock();
-       preempt_enable();
-       release_sock(skops.sk);
-       fput(socket->file);
-       return err;
-}
-
-static void sock_map_release(struct bpf_map *map)
-{
-       struct bpf_sock_progs *progs;
-       struct bpf_prog *orig;
-
-       if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-               struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-               progs = &stab->progs;
-       } else {
-               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-               progs = &htab->progs;
-       }
-
-       orig = xchg(&progs->bpf_parse, NULL);
-       if (orig)
-               bpf_prog_put(orig);
-       orig = xchg(&progs->bpf_verdict, NULL);
-       if (orig)
-               bpf_prog_put(orig);
-
-       orig = xchg(&progs->bpf_tx_msg, NULL);
-       if (orig)
-               bpf_prog_put(orig);
-}
-
-static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
-{
-       struct bpf_htab *htab;
-       int i, err;
-       u64 cost;
-
-       if (!capable(CAP_NET_ADMIN))
-               return ERR_PTR(-EPERM);
-
-       /* check sanity of attributes */
-       if (attr->max_entries == 0 ||
-           attr->key_size == 0 ||
-           attr->value_size != 4 ||
-           attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-               return ERR_PTR(-EINVAL);
-
-       if (attr->key_size > MAX_BPF_STACK)
-               /* eBPF programs initialize keys on stack, so they cannot be
-                * larger than max stack size
-                */
-               return ERR_PTR(-E2BIG);
-
-       htab = kzalloc(sizeof(*htab), GFP_USER);
-       if (!htab)
-               return ERR_PTR(-ENOMEM);
-
-       bpf_map_init_from_attr(&htab->map, attr);
-
-       htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
-       htab->elem_size = sizeof(struct htab_elem) +
-                         round_up(htab->map.key_size, 8);
-       err = -EINVAL;
-       if (htab->n_buckets == 0 ||
-           htab->n_buckets > U32_MAX / sizeof(struct bucket))
-               goto free_htab;
-
-       cost = (u64) htab->n_buckets * sizeof(struct bucket) +
-              (u64) htab->elem_size * htab->map.max_entries;
-
-       if (cost >= U32_MAX - PAGE_SIZE)
-               goto free_htab;
-
-       htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-       err = bpf_map_precharge_memlock(htab->map.pages);
-       if (err)
-               goto free_htab;
-
-       err = -ENOMEM;
-       htab->buckets = bpf_map_area_alloc(
-                               htab->n_buckets * sizeof(struct bucket),
-                               htab->map.numa_node);
-       if (!htab->buckets)
-               goto free_htab;
-
-       for (i = 0; i < htab->n_buckets; i++) {
-               INIT_HLIST_HEAD(&htab->buckets[i].head);
-               raw_spin_lock_init(&htab->buckets[i].lock);
-       }
-
-       return &htab->map;
-free_htab:
-       kfree(htab);
-       return ERR_PTR(err);
-}
-
-static void __bpf_htab_free(struct rcu_head *rcu)
-{
-       struct bpf_htab *htab;
-
-       htab = container_of(rcu, struct bpf_htab, rcu);
-       bpf_map_area_free(htab->buckets);
-       kfree(htab);
-}
-
-static void sock_hash_free(struct bpf_map *map)
-{
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-       int i;
-
-       synchronize_rcu();
-
-       /* At this point no update, lookup or delete operations can happen.
-        * However, be aware we can still get a socket state event updates,
-        * and data ready callabacks that reference the psock from sk_user_data
-        * Also psock worker threads are still in-flight. So smap_release_sock
-        * will only free the psock after cancel_sync on the worker threads
-        * and a grace period expire to ensure psock is really safe to remove.
-        */
-       rcu_read_lock();
-       for (i = 0; i < htab->n_buckets; i++) {
-               struct bucket *b = __select_bucket(htab, i);
-               struct hlist_head *head;
-               struct hlist_node *n;
-               struct htab_elem *l;
-
-               raw_spin_lock_bh(&b->lock);
-               head = &b->head;
-               hlist_for_each_entry_safe(l, n, head, hash_node) {
-                       struct sock *sock = l->sk;
-                       struct smap_psock *psock;
-
-                       hlist_del_rcu(&l->hash_node);
-                       psock = smap_psock_sk(sock);
-                       /* This check handles a racing sock event that can get
-                        * the sk_callback_lock before this case but after xchg
-                        * causing the refcnt to hit zero and sock user data
-                        * (psock) to be null and queued for garbage collection.
-                        */
-                       if (likely(psock)) {
-                               smap_list_hash_remove(psock, l);
-                               smap_release_sock(psock, sock);
-                       }
-                       free_htab_elem(htab, l);
-               }
-               raw_spin_unlock_bh(&b->lock);
-       }
-       rcu_read_unlock();
-       call_rcu(&htab->rcu, __bpf_htab_free);
-}
-
-static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
-                                             void *key, u32 key_size, u32 hash,
-                                             struct sock *sk,
-                                             struct htab_elem *old_elem)
-{
-       struct htab_elem *l_new;
-
-       if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-               if (!old_elem) {
-                       atomic_dec(&htab->count);
-                       return ERR_PTR(-E2BIG);
-               }
-       }
-       l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
-                            htab->map.numa_node);
-       if (!l_new) {
-               atomic_dec(&htab->count);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       memcpy(l_new->key, key, key_size);
-       l_new->sk = sk;
-       l_new->hash = hash;
-       return l_new;
-}
-
-static inline u32 htab_map_hash(const void *key, u32 key_len)
-{
-       return jhash(key, key_len, 0);
-}
-
-static int sock_hash_get_next_key(struct bpf_map *map,
-                                 void *key, void *next_key)
-{
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-       struct htab_elem *l, *next_l;
-       struct hlist_head *h;
-       u32 hash, key_size;
-       int i = 0;
-
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       key_size = map->key_size;
-       if (!key)
-               goto find_first_elem;
-       hash = htab_map_hash(key, key_size);
-       h = select_bucket(htab, hash);
-
-       l = lookup_elem_raw(h, hash, key, key_size);
-       if (!l)
-               goto find_first_elem;
-       next_l = hlist_entry_safe(
-                    rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
-                    struct htab_elem, hash_node);
-       if (next_l) {
-               memcpy(next_key, next_l->key, key_size);
-               return 0;
-       }
-
-       /* no more elements in this hash list, go to the next bucket */
-       i = hash & (htab->n_buckets - 1);
-       i++;
-
-find_first_elem:
-       /* iterate over buckets */
-       for (; i < htab->n_buckets; i++) {
-               h = select_bucket(htab, i);
-
-               /* pick first element in the bucket */
-               next_l = hlist_entry_safe(
-                               rcu_dereference_raw(hlist_first_rcu(h)),
-                               struct htab_elem, hash_node);
-               if (next_l) {
-                       /* if it's not empty, just return it */
-                       memcpy(next_key, next_l->key, key_size);
-                       return 0;
-               }
-       }
-
-       /* iterated over all buckets and all elements */
-       return -ENOENT;
-}
-
-static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-                                    struct bpf_map *map,
-                                    void *key, u64 map_flags)
-{
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-       struct bpf_sock_progs *progs = &htab->progs;
-       struct htab_elem *l_new = NULL, *l_old;
-       struct smap_psock_map_entry *e = NULL;
-       struct hlist_head *head;
-       struct smap_psock *psock;
-       u32 key_size, hash;
-       struct sock *sock;
-       struct bucket *b;
-       int err;
-
-       sock = skops->sk;
-
-       if (sock->sk_type != SOCK_STREAM ||
-           sock->sk_protocol != IPPROTO_TCP)
-               return -EOPNOTSUPP;
-
-       if (unlikely(map_flags > BPF_EXIST))
-               return -EINVAL;
-
-       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-       if (!e)
-               return -ENOMEM;
-
-       WARN_ON_ONCE(!rcu_read_lock_held());
-       key_size = map->key_size;
-       hash = htab_map_hash(key, key_size);
-       b = __select_bucket(htab, hash);
-       head = &b->head;
-
-       err = __sock_map_ctx_update_elem(map, progs, sock, key);
-       if (err)
-               goto err;
-
-       /* psock is valid here because otherwise above *ctx_update_elem would
-        * have thrown an error. It is safe to skip error check.
-        */
-       psock = smap_psock_sk(sock);
-       raw_spin_lock_bh(&b->lock);
-       l_old = lookup_elem_raw(head, hash, key, key_size);
-       if (l_old && map_flags == BPF_NOEXIST) {
-               err = -EEXIST;
-               goto bucket_err;
-       }
-       if (!l_old && map_flags == BPF_EXIST) {
-               err = -ENOENT;
-               goto bucket_err;
-       }
-
-       l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
-       if (IS_ERR(l_new)) {
-               err = PTR_ERR(l_new);
-               goto bucket_err;
-       }
-
-       rcu_assign_pointer(e->hash_link, l_new);
-       e->map = map;
-       spin_lock_bh(&psock->maps_lock);
-       list_add_tail(&e->list, &psock->maps);
-       spin_unlock_bh(&psock->maps_lock);
-
-       /* add new element to the head of the list, so that
-        * concurrent search will find it before old elem
-        */
-       hlist_add_head_rcu(&l_new->hash_node, head);
-       if (l_old) {
-               psock = smap_psock_sk(l_old->sk);
-
-               hlist_del_rcu(&l_old->hash_node);
-               smap_list_hash_remove(psock, l_old);
-               smap_release_sock(psock, l_old->sk);
-               free_htab_elem(htab, l_old);
-       }
-       raw_spin_unlock_bh(&b->lock);
-       return 0;
-bucket_err:
-       smap_release_sock(psock, sock);
-       raw_spin_unlock_bh(&b->lock);
-err:
-       kfree(e);
-       return err;
-}
-
-static int sock_hash_update_elem(struct bpf_map *map,
-                               void *key, void *value, u64 flags)
-{
-       struct bpf_sock_ops_kern skops;
-       u32 fd = *(u32 *)value;
-       struct socket *socket;
-       int err;
-
-       socket = sockfd_lookup(fd, &err);
-       if (!socket)
-               return err;
-
-       skops.sk = socket->sk;
-       if (!skops.sk) {
-               fput(socket->file);
-               return -EINVAL;
-       }
-
-       /* ULPs are currently supported only for TCP sockets in ESTABLISHED
-        * state.
-        */
-       if (skops.sk->sk_type != SOCK_STREAM ||
-           skops.sk->sk_protocol != IPPROTO_TCP ||
-           skops.sk->sk_state != TCP_ESTABLISHED) {
-               fput(socket->file);
-               return -EOPNOTSUPP;
-       }
-
-       lock_sock(skops.sk);
-       preempt_disable();
-       rcu_read_lock();
-       err = sock_hash_ctx_update_elem(&skops, map, key, flags);
-       rcu_read_unlock();
-       preempt_enable();
-       release_sock(skops.sk);
-       fput(socket->file);
-       return err;
-}
-
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
-{
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-       struct hlist_head *head;
-       struct bucket *b;
-       struct htab_elem *l;
-       u32 hash, key_size;
-       int ret = -ENOENT;
-
-       key_size = map->key_size;
-       hash = htab_map_hash(key, key_size);
-       b = __select_bucket(htab, hash);
-       head = &b->head;
-
-       raw_spin_lock_bh(&b->lock);
-       l = lookup_elem_raw(head, hash, key, key_size);
-       if (l) {
-               struct sock *sock = l->sk;
-               struct smap_psock *psock;
-
-               hlist_del_rcu(&l->hash_node);
-               psock = smap_psock_sk(sock);
-               /* This check handles a racing sock event that can get the
-                * sk_callback_lock before this case but after xchg happens
-                * causing the refcnt to hit zero and sock user data (psock)
-                * to be null and queued for garbage collection.
-                */
-               if (likely(psock)) {
-                       smap_list_hash_remove(psock, l);
-                       smap_release_sock(psock, sock);
-               }
-               free_htab_elem(htab, l);
-               ret = 0;
-       }
-       raw_spin_unlock_bh(&b->lock);
-       return ret;
-}
-
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
-{
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-       struct hlist_head *head;
-       struct htab_elem *l;
-       u32 key_size, hash;
-       struct bucket *b;
-       struct sock *sk;
-
-       key_size = map->key_size;
-       hash = htab_map_hash(key, key_size);
-       b = __select_bucket(htab, hash);
-       head = &b->head;
-
-       l = lookup_elem_raw(head, hash, key, key_size);
-       sk = l ? l->sk : NULL;
-       return sk;
-}
-
-const struct bpf_map_ops sock_map_ops = {
-       .map_alloc = sock_map_alloc,
-       .map_free = sock_map_free,
-       .map_lookup_elem = sock_map_lookup,
-       .map_get_next_key = sock_map_get_next_key,
-       .map_update_elem = sock_map_update_elem,
-       .map_delete_elem = sock_map_delete_elem,
-       .map_release_uref = sock_map_release,
-       .map_check_btf = map_check_no_btf,
-};
-
-const struct bpf_map_ops sock_hash_ops = {
-       .map_alloc = sock_hash_alloc,
-       .map_free = sock_hash_free,
-       .map_lookup_elem = sock_map_lookup,
-       .map_get_next_key = sock_hash_get_next_key,
-       .map_update_elem = sock_hash_update_elem,
-       .map_delete_elem = sock_hash_delete_elem,
-       .map_release_uref = sock_map_release,
-       .map_check_btf = map_check_no_btf,
-};
-
-static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
-{
-       return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
-              ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
-}
-BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
-          struct bpf_map *, map, void *, key, u64, flags)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       /* ULPs are currently supported only for TCP sockets in ESTABLISHED
-        * state. This checks that the sock ops triggering the update is
-        * one indicating we are (or will be soon) in an ESTABLISHED state.
-        */
-       if (!bpf_is_valid_sock_op(bpf_sock))
-               return -EOPNOTSUPP;
-       return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_map_update_proto = {
-       .func           = bpf_sock_map_update,
-       .gpl_only       = false,
-       .pkt_access     = true,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_PTR_TO_MAP_KEY,
-       .arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
-          struct bpf_map *, map, void *, key, u64, flags)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (!bpf_is_valid_sock_op(bpf_sock))
-               return -EOPNOTSUPP;
-       return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_hash_update_proto = {
-       .func           = bpf_sock_hash_update,
-       .gpl_only       = false,
-       .pkt_access     = true,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_PTR_TO_MAP_KEY,
-       .arg4_type      = ARG_ANYTHING,
-};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 53968f8..f4ecd6e 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
         switch (ptype) {
         case BPF_PROG_TYPE_SK_SKB:
         case BPF_PROG_TYPE_SK_MSG:
-               ret = sockmap_get_from_fd(attr, ptype, prog);
+               ret = sock_map_get_from_fd(attr, prog);
                 break;
         case BPF_PROG_TYPE_LIRC_MODE2:
                 ret = lirc_prog_attach(attr, prog);
@@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
                 ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
                 break;
         case BPF_SK_MSG_VERDICT:
-               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
+               return sock_map_get_from_fd(attr, NULL);
         case BPF_SK_SKB_STREAM_PARSER:
         case BPF_SK_SKB_STREAM_VERDICT:
-               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
+               return sock_map_get_from_fd(attr, NULL);
         case BPF_LIRC_MODE2:
                 return lirc_prog_detach(attr);
         case BPF_FLOW_DISSECTOR:
diff --git a/net/Kconfig b/net/Kconfig

index 228dfa3..f235edb 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
  
  config BPF_STREAM_PARSER
         bool "enable BPF STREAM_PARSER"
+       depends on INET
         depends on BPF_SYSCALL
+       depends on CGROUP_BPF
         select STREAM_PARSER
+       select NET_SOCK_MSG
         ---help---
          Enabling this allows a stream parser to be used with
          BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
  config SOCK_VALIDATE_XMIT
         bool
  
+config NET_SOCK_MSG
+       bool
+       default n
+       help
+         The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+         ULPs (upper layer protocols, e.g. TLS) to process L7 application data
+         with the help of BPF programs.
+
  config NET_DEVLINK
         tristate "Network physical/parent device Netlink interface"
         help
diff --git a/net/core/Makefile b/net/core/Makefile

index 80175e6..fccd31e 100644 (file)
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y              += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
  obj-y += net-sysfs.o
  obj-$(CONFIG_PAGE_POOL) += page_pool.o
  obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
  obj-$(CONFIG_NET_PKTGEN) += pktgen.o
  obj-$(CONFIG_NETPOLL) += netpoll.o
  obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
  obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
  obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
  obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
  obj-$(CONFIG_DST_CACHE) += dst_cache.o
  obj-$(CONFIG_HWBM) += hwbm.o
  obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c

index b844761..0f5260b 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
  #include <net/protocol.h>
  #include <net/netlink.h>
  #include <linux/skbuff.h>
+#include <linux/skmsg.h>
  #include <net/sock.h>
  #include <net/flow_dissector.h>
  #include <linux/errno.h>
@@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
         .arg2_type      = ARG_ANYTHING,
  };
  
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
-          struct bpf_map *, map, void *, key, u64, flags)
-{
-       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-       /* If user passes invalid input drop the packet. */
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
-               return SK_DROP;
-
-       tcb->bpf.flags = flags;
-       tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
-       if (!tcb->bpf.sk_redir)
-               return SK_DROP;
-
-       return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
-       .func           = bpf_sk_redirect_hash,
-       .gpl_only       = false,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_PTR_TO_MAP_KEY,
-       .arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
-          struct bpf_map *, map, u32, key, u64, flags)
-{
-       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-       /* If user passes invalid input drop the packet. */
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
-               return SK_DROP;
-
-       tcb->bpf.flags = flags;
-       tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
-       if (!tcb->bpf.sk_redir)
-               return SK_DROP;
-
-       return SK_PASS;
-}
-
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
-       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-       return tcb->bpf.sk_redir;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
-       .func           = bpf_sk_redirect_map,
-       .gpl_only       = false,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_ANYTHING,
-       .arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
-          struct bpf_map *, map, void *, key, u64, flags)
-{
-       /* If user passes invalid input drop the packet. */
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
-               return SK_DROP;
-
-       msg->flags = flags;
-       msg->sk_redir = __sock_hash_lookup_elem(map, key);
-       if (!msg->sk_redir)
-               return SK_DROP;
-
-       return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
-       .func           = bpf_msg_redirect_hash,
-       .gpl_only       = false,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_PTR_TO_MAP_KEY,
-       .arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
-          struct bpf_map *, map, u32, key, u64, flags)
-{
-       /* If user passes invalid input drop the packet. */
-       if (unlikely(flags & ~(BPF_F_INGRESS)))
-               return SK_DROP;
-
-       msg->flags = flags;
-       msg->sk_redir = __sock_map_lookup_elem(map, key);
-       if (!msg->sk_redir)
-               return SK_DROP;
-
-       return SK_PASS;
-}
-
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
-       return msg->sk_redir;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
-       .func           = bpf_msg_redirect_map,
-       .gpl_only       = false,
-       .ret_type       = RET_INTEGER,
-       .arg1_type      = ARG_PTR_TO_CTX,
-       .arg2_type      = ARG_CONST_MAP_PTR,
-       .arg3_type      = ARG_ANYTHING,
-       .arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
  {
         msg->apply_bytes = bytes;
         return 0;
@@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
         .arg2_type      = ARG_ANYTHING,
  };
  
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
  {
         msg->cork_bytes = bytes;
         return 0;
@@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
         .arg2_type      = ARG_ANYTHING,
  };
  
-#define sk_msg_iter_var(var)                   \
-       do {                                    \
-               var++;                          \
-               if (var == MAX_SKB_FRAGS)       \
-                       var = 0;                \
-       } while (0)
-
-BPF_CALL_4(bpf_msg_pull_data,
-          struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+          u32, end, u64, flags)
  {
-       unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
-       int bytes = end - start, bytes_sg_total;
-       struct scatterlist *sg = msg->sg_data;
-       int first_sg, last_sg, i, shift;
-       unsigned char *p, *to, *from;
+       u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+       u32 first_sge, last_sge, i, shift, bytes_sg_total;
+       struct scatterlist *sge;
+       u8 *raw, *to, *from;
         struct page *page;
  
         if (unlikely(flags || end <= start))
                 return -EINVAL;
  
         /* First find the starting scatterlist element */
-       i = msg->sg_start;
+       i = msg->sg.start;
         do {
-               len = sg[i].length;
+               len = sk_msg_elem(msg, i)->length;
                 if (start < offset + len)
                         break;
                 offset += len;
-               sk_msg_iter_var(i);
-       } while (i != msg->sg_end);
+               sk_msg_iter_var_next(i);
+       } while (i != msg->sg.end);
  
         if (unlikely(start >= offset + len))
                 return -EINVAL;
  
-       first_sg = i;
+       first_sge = i;
         /* The start may point into the sg element so we need to also
          * account for the headroom.
          */
         bytes_sg_total = start - offset + bytes;
-       if (!msg->sg_copy[i] && bytes_sg_total <= len)
+       if (!msg->sg.copy[i] && bytes_sg_total <= len)
                 goto out;
  
         /* At this point we need to linearize multiple scatterlist
@@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
          * will copy the entire sg entry.
          */
         do {
-               copy += sg[i].length;
-               sk_msg_iter_var(i);
+               copy += sk_msg_elem(msg, i)->length;
+               sk_msg_iter_var_next(i);
                 if (bytes_sg_total <= copy)
                         break;
-       } while (i != msg->sg_end);
-       last_sg = i;
+       } while (i != msg->sg.end);
+       last_sge = i;
  
         if (unlikely(bytes_sg_total > copy))
                 return -EINVAL;
@@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
                            get_order(copy));
         if (unlikely(!page))
                 return -ENOMEM;
-       p = page_address(page);
  
-       i = first_sg;
+       raw = page_address(page);
+       i = first_sge;
         do {
-               from = sg_virt(&sg[i]);
-               len = sg[i].length;
-               to = p + poffset;
+               sge = sk_msg_elem(msg, i);
+               from = sg_virt(sge);
+               len = sge->length;
+               to = raw + poffset;
  
                 memcpy(to, from, len);
                 poffset += len;
-               sg[i].length = 0;
-               put_page(sg_page(&sg[i]));
+               sge->length = 0;
+               put_page(sg_page(sge));
  
-               sk_msg_iter_var(i);
-       } while (i != last_sg);
+               sk_msg_iter_var_next(i);
+       } while (i != last_sge);
  
-       sg[first_sg].length = copy;
-       sg_set_page(&sg[first_sg], page, copy, 0);
+       sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
  
         /* To repair sg ring we need to shift entries. If we only
          * had a single entry though we can just replace it and
          * be done. Otherwise walk the ring and shift the entries.
          */
-       WARN_ON_ONCE(last_sg == first_sg);
-       shift = last_sg > first_sg ?
-               last_sg - first_sg - 1 :
-               MAX_SKB_FRAGS - first_sg + last_sg - 1;
+       WARN_ON_ONCE(last_sge == first_sge);
+       shift = last_sge > first_sge ?
+               last_sge - first_sge - 1 :
+               MAX_SKB_FRAGS - first_sge + last_sge - 1;
         if (!shift)
                 goto out;
  
-       i = first_sg;
-       sk_msg_iter_var(i);
+       i = first_sge;
+       sk_msg_iter_var_next(i);
         do {
-               int move_from;
+               u32 move_from;
  
-               if (i + shift >= MAX_SKB_FRAGS)
-                       move_from = i + shift - MAX_SKB_FRAGS;
+               if (i + shift >= MAX_MSG_FRAGS)
+                       move_from = i + shift - MAX_MSG_FRAGS;
                 else
                         move_from = i + shift;
-
-               if (move_from == msg->sg_end)
+               if (move_from == msg->sg.end)
                         break;
  
-               sg[i] = sg[move_from];
-               sg[move_from].length = 0;
-               sg[move_from].page_link = 0;
-               sg[move_from].offset = 0;
-
-               sk_msg_iter_var(i);
+               msg->sg.data[i] = msg->sg.data[move_from];
+               msg->sg.data[move_from].length = 0;
+               msg->sg.data[move_from].page_link = 0;
+               msg->sg.data[move_from].offset = 0;
+               sk_msg_iter_var_next(i);
         } while (1);
-       msg->sg_end -= shift;
-       if (msg->sg_end < 0)
-               msg->sg_end += MAX_SKB_FRAGS;
+
+       msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+                     msg->sg.end - shift + MAX_MSG_FRAGS :
+                     msg->sg.end - shift;
  out:
-       msg->data = sg_virt(&sg[first_sg]) + start - offset;
+       msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
         msg->data_end = msg->data + bytes;
-
         return 0;
  }
  
@@ -5203,6 +5078,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         }
  }
  
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
  static const struct bpf_func_proto *
  sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
  {
@@ -5226,6 +5104,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         }
  }
  
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
  static const struct bpf_func_proto *
  sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
  {
@@ -5247,6 +5128,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         }
  }
  
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
  static const struct bpf_func_proto *
  sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
  {
@@ -7001,22 +6885,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
  
         switch (si->off) {
         case offsetof(struct sk_msg_md, data):
-               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, data));
+                                     offsetof(struct sk_msg, data));
                 break;
         case offsetof(struct sk_msg_md, data_end):
-               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, data_end));
+                                     offsetof(struct sk_msg, data_end));
                 break;
         case offsetof(struct sk_msg_md, family):
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
  
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                             struct sk_msg_buff, sk),
+                                             struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common, skc_family));
                 break;
@@ -7025,9 +6909,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
  
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                               struct sk_msg_buff, sk),
+                                               struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common, skc_daddr));
                 break;
@@ -7037,9 +6921,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                                           skc_rcv_saddr) != 4);
  
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                             struct sk_msg_buff, sk),
+                                             struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common,
                                                skc_rcv_saddr));
@@ -7054,9 +6938,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                 off = si->off;
                 off -= offsetof(struct sk_msg_md, remote_ip6[0]);
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                               struct sk_msg_buff, sk),
+                                               struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common,
                                                skc_v6_daddr.s6_addr32[0]) +
@@ -7075,9 +6959,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                 off = si->off;
                 off -= offsetof(struct sk_msg_md, local_ip6[0]);
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                               struct sk_msg_buff, sk),
+                                               struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common,
                                                skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -7091,9 +6975,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
  
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                               struct sk_msg_buff, sk),
+                                               struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common, skc_dport));
  #ifndef __BIG_ENDIAN_BITFIELD
@@ -7105,9 +6989,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
  
                 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-                                               struct sk_msg_buff, sk),
+                                               struct sk_msg, sk),
                                       si->dst_reg, si->src_reg,
-                                     offsetof(struct sk_msg_buff, sk));
+                                     offsetof(struct sk_msg, sk));
                 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                       offsetof(struct sock_common, skc_num));
                 break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c

new file mode 100644 (file)

index 0000000..ae2b281
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+/* Tell sk_msg_alloc() whether it may extend the most recently added
+ * element rather than consume a new one. @elem_first_coalesce is compared
+ * against the ring's start/end indices; both the non-wrapped
+ * (end > start) and the wrapped (end < start) layouts are handled. An
+ * empty ring (end == start) never coalesces.
+ */
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+       if (msg->sg.end > msg->sg.start)
+               return elem_first_coalesce < msg->sg.end;
+
+       if (msg->sg.end < msg->sg.start)
+               return elem_first_coalesce > msg->sg.start ||
+                      elem_first_coalesce < msg->sg.end;
+
+       return false;
+}
+
+/* Grow @msg until msg->sg.size reaches @len, taking memory from the
+ * socket's page frag (sk_page_frag()).
+ *
+ * @elem_first_coalesce is passed to sk_msg_try_coalesce_ok() to decide
+ * whether new bytes may be appended to the last element instead of
+ * occupying a new scatterlist slot.
+ *
+ * Returns 0 on success, -ENOMEM when the page frag cannot be refilled or
+ * the send memory cannot be scheduled, and -ENOSPC when a new element is
+ * needed but the ring is full. Bytes added before a failure stay in @msg.
+ */
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+                int elem_first_coalesce)
+{
+       struct page_frag *pfrag = sk_page_frag(sk);
+       int ret = 0;
+
+       /* Only the shortfall relative to the current size is allocated. */
+       len -= msg->sg.size;
+       while (len > 0) {
+               struct scatterlist *sge;
+               u32 orig_offset;
+               int use, i;
+
+               if (!sk_page_frag_refill(sk, pfrag))
+                       return -ENOMEM;
+
+               orig_offset = pfrag->offset;
+               use = min_t(int, len, pfrag->size - orig_offset);
+               if (!sk_wmem_schedule(sk, use))
+                       return -ENOMEM;
+
+               /* Look at the last used element (the one before 'end'). */
+               i = msg->sg.end;
+               sk_msg_iter_var_prev(i);
+               sge = &msg->sg.data[i];
+
+               /* Coalesce only if the new bytes directly follow the last
+                * element within the same page.
+                */
+               if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+                   sg_page(sge) == pfrag->page &&
+                   sge->offset + sge->length == orig_offset) {
+                       sge->length += use;
+               } else {
+                       if (sk_msg_full(msg)) {
+                               ret = -ENOSPC;
+                               break;
+                       }
+
+                       sge = &msg->sg.data[msg->sg.end];
+                       sg_unmark_end(sge);
+                       sg_set_page(sge, pfrag->page, use, orig_offset);
+                       /* The new element holds its own page reference. */
+                       get_page(pfrag->page);
+                       sk_msg_iter_next(msg, end);
+               }
+
+               sk_mem_charge(sk, use);
+               msg->sg.size += use;
+               pfrag->offset += use;
+               len -= use;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+/* Uncharge @bytes from @sk, walking @msg from sg.start. Elements that are
+ * fully covered get their length and offset zeroed; a partially covered
+ * element is shrunk from the front. sg.start is advanced to the element
+ * where the walk stopped.
+ */
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+       int idx = msg->sg.start;
+
+       for (;;) {
+               struct scatterlist *elem = sk_msg_elem(msg, idx);
+
+               if (bytes < elem->length) {
+                       /* Partial element: consume its head and stop. */
+                       elem->length -= bytes;
+                       elem->offset += bytes;
+                       sk_mem_uncharge(sk, bytes);
+                       break;
+               }
+
+               sk_mem_uncharge(sk, elem->length);
+               bytes -= elem->length;
+               elem->length = 0;
+               elem->offset = 0;
+               sk_msg_iter_var_next(idx);
+
+               if (!bytes || idx == msg->sg.end)
+                       break;
+       }
+       msg->sg.start = idx;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+/* Uncharge up to @bytes from @sk across the elements of @msg, starting at
+ * sg.start. Unlike sk_msg_return_zero() the elements themselves are left
+ * untouched; only the socket memory accounting changes.
+ */
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+       int idx = msg->sg.start;
+
+       do {
+               struct scatterlist *elem = &msg->sg.data[idx];
+               int uncharge;
+
+               /* Never uncharge more than this element accounts for. */
+               if (bytes < elem->length)
+                       uncharge = bytes;
+               else
+                       uncharge = elem->length;
+
+               sk_mem_uncharge(sk, uncharge);
+               bytes -= uncharge;
+               sk_msg_iter_var_next(idx);
+       } while (idx != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+/* Release element @i of @msg and return the length it held.
+ *
+ * The socket is uncharged only when @charge is set. The page reference is
+ * dropped only when the msg is not backed by an skb (msg->skb == NULL).
+ * The scatterlist entry is zeroed afterwards.
+ */
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+                           bool charge)
+{
+       struct scatterlist *elem = sk_msg_elem(msg, i);
+       u32 released = elem->length;
+
+       if (charge)
+               sk_mem_uncharge(sk, released);
+
+       if (!msg->skb)
+               put_page(sg_page(elem));
+
+       memset(elem, 0, sizeof(*elem));
+
+       return released;
+}
+
+/* Free every element of @msg starting at index @i until sg.size drops to
+ * zero, consume the backing skb if there is one, and reinitialise @msg.
+ * @charge selects whether the socket is uncharged for each element.
+ * Returns the total number of bytes freed.
+ */
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+                        bool charge)
+{
+       int total = 0;
+
+       while (msg->sg.size) {
+               struct scatterlist *elem = sk_msg_elem(msg, i);
+
+               msg->sg.size -= elem->length;
+               total += sk_msg_free_elem(sk, msg, i, charge);
+               sk_msg_iter_var_next(i);
+               sk_msg_check_to_free(msg, i, msg->sg.size);
+       }
+
+       if (msg->skb)
+               consume_skb(msg->skb);
+
+       sk_msg_init(msg);
+
+       return total;
+}
+
+/* Free all of @msg from sg.start without uncharging @sk.
+ * Returns the number of bytes freed.
+ */
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+       u32 first = msg->sg.start;
+
+       return __sk_msg_free(sk, msg, first, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+/* Free all of @msg from sg.start and uncharge @sk for every element.
+ * Returns the number of bytes freed.
+ */
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+       u32 first = msg->sg.start;
+
+       return __sk_msg_free(sk, msg, first, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+/* Free the first @bytes of @msg, starting at sg.start.
+ *
+ * Whole elements are released through sk_msg_free_elem(); an element that
+ * is only partly covered is shrunk from the front. The walk also stops on
+ * the first zero-length element. sg.start ends up on the element where the
+ * walk stopped. @charge selects whether the socket is uncharged.
+ */
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+                                 u32 bytes, bool charge)
+{
+       u32 idx = msg->sg.start;
+
+       while (bytes) {
+               struct scatterlist *elem = sk_msg_elem(msg, idx);
+
+               if (!elem->length)
+                       break;
+
+               if (bytes < elem->length) {
+                       /* Partial element: trim its head and finish. */
+                       if (charge)
+                               sk_mem_uncharge(sk, bytes);
+                       elem->length -= bytes;
+                       elem->offset += bytes;
+                       msg->sg.size -= bytes;
+                       break;
+               }
+
+               msg->sg.size -= elem->length;
+               bytes -= elem->length;
+               sk_msg_free_elem(sk, msg, idx, charge);
+               sk_msg_iter_var_next(idx);
+               sk_msg_check_to_free(msg, idx, bytes);
+       }
+       msg->sg.start = idx;
+}
+
+/* Free the first @bytes of @msg and uncharge @sk for them. */
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+       const bool charge = true;
+
+       __sk_msg_free_partial(sk, msg, bytes, charge);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+/* Free the first @bytes of @msg without touching @sk's memory accounting. */
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+                                 u32 bytes)
+{
+       const bool charge = false;
+
+       __sk_msg_free_partial(sk, msg, bytes, charge);
+}
+
+/* Shrink @msg to @len bytes by releasing data from its tail.
+ *
+ * Does nothing when @msg is already @len bytes, and warns (WARN_ON) when
+ * @len is larger than the current size. Freed bytes are uncharged from
+ * @sk. On return sg.end points just past the last element that still
+ * belongs to the message.
+ */
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+       int trim = msg->sg.size - len;
+       u32 i = msg->sg.end;
+
+       if (trim <= 0) {
+               WARN_ON(trim < 0);
+               return;
+       }
+
+       /* Start at the last used element and walk backwards. */
+       sk_msg_iter_var_prev(i);
+       msg->sg.size = len;
+       /* Drop whole elements while they fit entirely in what is trimmed. */
+       while (msg->sg.data[i].length &&
+              trim >= msg->sg.data[i].length) {
+               trim -= msg->sg.data[i].length;
+               sk_msg_free_elem(sk, msg, i, true);
+               sk_msg_iter_var_prev(i);
+               if (!trim)
+                       goto out;
+       }
+
+       /* The remainder is cut from the tail of element i. */
+       msg->sg.data[i].length -= trim;
+       sk_mem_uncharge(sk, trim);
+out:
+       /* If we trim data before curr pointer update copybreak and current
+        * so that any future copy operations start at new copy location.
+        * However trimmed data that has not yet been used in a copy op
+        * does not require an update.
+        */
+       if (msg->sg.curr >= i) {
+               msg->sg.curr = i;
+               msg->sg.copybreak = msg->sg.data[i].length;
+       }
+       sk_msg_iter_var_next(i);
+       msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+/* Append up to @bytes from @from to @msg without copying: the pages
+ * returned by iov_iter_get_pages() are linked directly into new
+ * scatterlist elements and charged to @sk.
+ *
+ * Returns 0 on success. Returns -EFAULT when no free element slots remain
+ * or when iov_iter_get_pages() yields nothing; in that case @from is
+ * reverted by the number of bytes added to @msg, while @msg itself is
+ * left for the caller to trim.
+ */
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+                             struct sk_msg *msg, u32 bytes)
+{
+       int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+       const int to_max_pages = MAX_MSG_FRAGS;
+       struct page *pages[MAX_MSG_FRAGS];
+       ssize_t orig, copied, use, offset;
+
+       /* Remember the starting size so the iterator can be reverted. */
+       orig = msg->sg.size;
+       while (bytes > 0) {
+               i = 0;
+               maxpages = to_max_pages - num_elems;
+               if (maxpages == 0) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+                                           &offset);
+               if (copied <= 0) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               iov_iter_advance(from, copied);
+               bytes -= copied;
+               msg->sg.size += copied;
+
+               /* One element per page; only the first page can start at a
+                * non-zero offset.
+                */
+               while (copied) {
+                       use = min_t(int, copied, PAGE_SIZE - offset);
+                       sg_set_page(&msg->sg.data[msg->sg.end],
+                                   pages[i], use, offset);
+                       sg_unmark_end(&msg->sg.data[msg->sg.end]);
+                       sk_mem_charge(sk, use);
+
+                       offset = 0;
+                       copied -= use;
+                       sk_msg_iter_next(msg, end);
+                       num_elems++;
+                       i++;
+               }
+               /* When zerocopy is mixed with sk_msg_*copy* operations we
+                * may have a copybreak set in this case clear and prefer
+                * zerocopy remainder when possible.
+                */
+               msg->sg.copybreak = 0;
+               msg->sg.curr = msg->sg.end;
+       }
+out:
+       /* Revert iov_iter updates, msg will need to use 'trim' later if it
+        * also needs to be cleared.
+        */
+       if (ret)
+               iov_iter_revert(from, msg->sg.size - orig);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+/* Copy up to @bytes from @from into the already allocated elements of
+ * @msg, resuming at element sg.curr, offset sg.copybreak.
+ *
+ * Returns -EFAULT when a copy comes up short and -ENOSPC when no element
+ * with room is found before sg.end. Otherwise the value returned is the
+ * result of the last copy_from_iter*() call, i.e. the size of the final
+ * chunk copied. sg.curr is updated to the element where copying stopped.
+ */
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+                            struct sk_msg *msg, u32 bytes)
+{
+       int ret = -ENOSPC, i = msg->sg.curr;
+       struct scatterlist *sge;
+       u32 copy, buf_size;
+       void *to;
+
+       do {
+               sge = sk_msg_elem(msg, i);
+               /* This is possible if a trim operation shrunk the buffer */
+               if (msg->sg.copybreak >= sge->length) {
+                       msg->sg.copybreak = 0;
+                       sk_msg_iter_var_next(i);
+                       if (i == msg->sg.end)
+                               break;
+                       sge = sk_msg_elem(msg, i);
+               }
+
+               /* Copy no more than what is left in this element. */
+               buf_size = sge->length - msg->sg.copybreak;
+               copy = (buf_size > bytes) ? bytes : buf_size;
+               to = sg_virt(sge) + msg->sg.copybreak;
+               msg->sg.copybreak += copy;
+               if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+                       ret = copy_from_iter_nocache(to, copy, from);
+               else
+                       ret = copy_from_iter(to, copy, from);
+               if (ret != copy) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               bytes -= copy;
+               if (!bytes)
+                       break;
+               /* Element filled; continue at the start of the next one. */
+               msg->sg.copybreak = 0;
+               sk_msg_iter_var_next(i);
+       } while (i != msg->sg.end);
+out:
+       msg->sg.curr = i;
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+/* Wrap @skb in a freshly allocated sk_msg and queue it on @psock's ingress
+ * message list, then signal the socket via sk_data_ready().
+ *
+ * Returns skb->len on success, -EAGAIN when the msg cannot be allocated or
+ * receive memory cannot be scheduled, or the negative error from
+ * skb_to_sgvec().
+ */
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+       struct sock *sk = psock->sk;
+       int ret = -EAGAIN, num_sge;
+       struct sk_msg *msg;
+
+       msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+       if (unlikely(!msg))
+               return ret;
+
+       if (!sk_rmem_schedule(sk, skb, skb->len))
+               goto free_msg;
+
+       sk_msg_init(msg);
+       num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+       if (unlikely(num_sge < 0)) {
+               ret = num_sge;
+               goto free_msg;
+       }
+
+       sk_mem_charge(sk, skb->len);
+       /* Latch the length before the msg is handed over to the queue. */
+       ret = skb->len;
+       msg->sg.start = 0;
+       /* A completely full ring wraps 'end' back to index 0. */
+       if (num_sge == MAX_MSG_FRAGS)
+               msg->sg.end = 0;
+       else
+               msg->sg.end = num_sge;
+       msg->skb = skb;
+
+       sk_psock_queue_msg(psock, msg);
+       sk->sk_data_ready(sk);
+       return ret;
+
+free_msg:
+       kfree(msg);
+       return ret;
+}
+
+/* Dispatch one skb from the backlog: egress data is sent on the psock's
+ * socket with skb_send_sock_locked() using @off/@len, ingress data is
+ * queued via sk_psock_skb_ingress(). Returns the callee's result.
+ */
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+                              u32 off, u32 len, bool ingress)
+{
+       if (!ingress)
+               return skb_send_sock_locked(psock->sk, skb, off, len);
+
+       return sk_psock_skb_ingress(psock, skb);
+}
+
+/* Work handler that drains psock->ingress_skb.
+ *
+ * Each skb is either transmitted on, or queued to the ingress of, the
+ * psock's socket through sk_psock_handle_skb(). When that returns -EAGAIN
+ * the skb and its remaining offset/length are parked in psock->work_state
+ * and the handler exits; the next run resumes from that saved state. Any
+ * other error (or a zero return) is reported on the psock, TX is disabled
+ * and the skb is dropped.
+ */
+static void sk_psock_backlog(struct work_struct *work)
+{
+       struct sk_psock *psock = container_of(work, struct sk_psock, work);
+       struct sk_psock_work_state *state = &psock->work_state;
+       struct sk_buff *skb;
+       bool ingress;
+       u32 len, off;
+       int ret;
+
+       /* Lock sock to avoid losing sk_socket during loop. */
+       lock_sock(psock->sk);
+       /* Resume a previously interrupted skb before dequeuing new ones. */
+       if (state->skb) {
+               skb = state->skb;
+               len = state->len;
+               off = state->off;
+               state->skb = NULL;
+               goto start;
+       }
+
+       while ((skb = skb_dequeue(&psock->ingress_skb))) {
+               len = skb->len;
+               off = 0;
+start:
+               ingress = tcp_skb_bpf_ingress(skb);
+               do {
+                       ret = -EIO;
+                       if (likely(psock->sk->sk_socket))
+                               ret = sk_psock_handle_skb(psock, skb, off,
+                                                         len, ingress);
+                       if (ret <= 0) {
+                               if (ret == -EAGAIN) {
+                                       /* Save progress for the next run. */
+                                       state->skb = skb;
+                                       state->len = len;
+                                       state->off = off;
+                                       goto end;
+                               }
+                               /* Hard errors break pipe and stop xmit. */
+                               sk_psock_report_error(psock, ret ? -ret : EPIPE);
+                               sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+                               kfree_skb(skb);
+                               goto end;
+                       }
+                       off += ret;
+                       len -= ret;
+               } while (len);
+
+               /* On ingress the skb is now referenced by the queued msg
+                * (see sk_psock_skb_ingress()), so only egress skbs are
+                * freed here.
+                */
+               if (!ingress)
+                       kfree_skb(skb);
+       }
+end:
+       release_sock(psock->sk);
+}
+
+/* Allocate a psock for @sk on NUMA node @node and attach it.
+ *
+ * The psock starts with TX enabled and a reference count of one. It is
+ * published through sk_user_data and a reference on @sk is taken.
+ * Returns the new psock, or NULL when the allocation fails.
+ */
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+       struct sk_psock *psock;
+
+       psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+       if (!psock)
+               return NULL;
+
+       psock->sk = sk;
+       psock->eval =  __SK_NONE;
+
+       /* Map link bookkeeping. */
+       INIT_LIST_HEAD(&psock->link);
+       spin_lock_init(&psock->link_lock);
+
+       /* Backlog worker and the ingress queues it serves. */
+       INIT_WORK(&psock->work, sk_psock_backlog);
+       INIT_LIST_HEAD(&psock->ingress_msg);
+       skb_queue_head_init(&psock->ingress_skb);
+
+       sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+       refcount_set(&psock->refcnt, 1);
+
+       /* Publish only once the psock is fully set up. */
+       rcu_assign_sk_user_data(sk, psock);
+       sock_hold(sk);
+
+       return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+/* Detach and return the first entry of @psock's link list, or NULL when
+ * the list is empty. The list is modified under link_lock.
+ */
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+       struct sk_psock_link *first = NULL;
+
+       spin_lock_bh(&psock->link_lock);
+       if (!list_empty(&psock->link)) {
+               first = list_first_entry(&psock->link, struct sk_psock_link,
+                                        list);
+               list_del(&first->list);
+       }
+       spin_unlock_bh(&psock->link_lock);
+
+       return first;
+}
+
+/* Unlink, free and release every sk_msg queued on @psock's ingress list.
+ * No lock is taken here.
+ */
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+       struct sk_msg *cur, *next;
+
+       list_for_each_entry_safe(cur, next, &psock->ingress_msg, list) {
+               list_del(&cur->list);
+               sk_msg_free(psock->sk, cur);
+               kfree(cur);
+       }
+}
+
+/* Discard all pending ingress data of @psock: first the queued skbs, then
+ * the queued sk_msgs.
+ */
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+       __skb_queue_purge(&psock->ingress_skb);
+       __sk_psock_purge_ingress_msg(psock);
+}
+
+/* Unlink and free every entry on @psock's link list. link_lock is not
+ * taken here.
+ */
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+       struct sk_psock_link *cur, *next;
+
+       list_for_each_entry_safe(cur, next, &psock->link, list) {
+               list_del(&cur->list);
+               sk_psock_free_link(cur);
+       }
+}
+
+/* Work handler scheduled by sk_psock_destroy() that performs the final
+ * teardown of a psock: finish the stream parser, wait for the backlog
+ * worker, drop programs, links, corked data and pending ingress data, then
+ * release the socket references and free the psock.
+ */
+static void sk_psock_destroy_deferred(struct work_struct *gc)
+{
+       struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+
+       /* No sk_callback_lock since already detached. */
+       if (psock->parser.enabled)
+               strp_done(&psock->parser.strp);
+
+       /* Make sure the backlog worker is not running any more. */
+       cancel_work_sync(&psock->work);
+
+       psock_progs_drop(&psock->progs);
+
+       sk_psock_link_destroy(psock);
+       sk_psock_cork_free(psock);
+       sk_psock_zap_ingress(psock);
+
+       /* Drop the redirect target's reference, if any, then the
+        * reference on our own socket taken in sk_psock_init().
+        */
+       if (psock->sk_redir)
+               sock_put(psock->sk_redir);
+       sock_put(psock->sk);
+       kfree(psock);
+}
+
+/* RCU callback (queued by sk_psock_drop()) that hands the actual teardown
+ * over to a work item running sk_psock_destroy_deferred().
+ */
+void sk_psock_destroy(struct rcu_head *rcu)
+{
+       struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
+
+       INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
+       schedule_work(&psock->gc);
+}
+EXPORT_SYMBOL_GPL(sk_psock_destroy);
+
+/* Detach @psock from @sk and schedule its destruction.
+ *
+ * sk_user_data is cleared, corked data is freed, the socket's original
+ * proto is restored, the stream parser is stopped (when an skb parser
+ * program is attached) and TX is disabled. The psock itself is torn down
+ * later from an RCU callback (sk_psock_destroy()).
+ */
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+       rcu_assign_sk_user_data(sk, NULL);
+       sk_psock_cork_free(psock);
+       sk_psock_restore_proto(sk, psock);
+
+       /* Socket callbacks are changed under sk_callback_lock. */
+       write_lock_bh(&sk->sk_callback_lock);
+       if (psock->progs.skb_parser)
+               sk_psock_stop_strp(sk, psock);
+       write_unlock_bh(&sk->sk_callback_lock);
+       sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+       call_rcu_sched(&psock->rcu, sk_psock_destroy);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+/* Translate a BPF program's return value into an internal verdict.
+ * SK_PASS becomes __SK_REDIRECT when @redir is set and __SK_PASS
+ * otherwise; every other value (including SK_DROP) becomes __SK_DROP.
+ */
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+       if (verdict != SK_PASS)
+               return __SK_DROP;
+
+       if (redir)
+               return __SK_REDIRECT;
+
+       return __SK_PASS;
+}
+
+/* Run the psock's msg_parser program on @msg and return the resulting
+ * internal verdict.
+ *
+ * With no program attached the verdict is __SK_PASS. Otherwise the
+ * program's return value is mapped through sk_psock_map_verd() and
+ * psock->apply_bytes is taken from @msg. On __SK_REDIRECT the psock's
+ * redirect socket is replaced by msg->sk_redir (reference counted); if no
+ * target was set the verdict is downgraded to __SK_DROP.
+ */
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+                        struct sk_msg *msg)
+{
+       struct bpf_prog *prog;
+       int ret = __SK_PASS;
+
+       preempt_disable();
+       rcu_read_lock();
+       prog = READ_ONCE(psock->progs.msg_parser);
+       if (unlikely(!prog))
+               goto out;
+
+       sk_msg_compute_data_pointers(msg);
+       msg->sk = sk;
+       ret = BPF_PROG_RUN(prog, msg);
+       ret = sk_psock_map_verd(ret, msg->sk_redir);
+       psock->apply_bytes = msg->apply_bytes;
+       if (ret != __SK_REDIRECT)
+               goto out;
+
+       /* Swap the redirect target: release the old one, hold the new. */
+       if (psock->sk_redir)
+               sock_put(psock->sk_redir);
+       psock->sk_redir = msg->sk_redir;
+       if (psock->sk_redir)
+               sock_hold(psock->sk_redir);
+       else
+               ret = __SK_DROP;
+out:
+       rcu_read_unlock();
+       preempt_enable();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+/* Run @prog on @skb with skb->sk temporarily pointing at the psock's
+ * socket, and return the program's result. Preemption is disabled around
+ * the program run.
+ */
+static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
+                           struct sk_buff *skb)
+{
+       int ret;
+
+       skb->sk = psock->sk;
+       bpf_compute_data_end_sk_skb(skb);
+       preempt_disable();
+       ret = BPF_PROG_RUN(prog, skb);
+       preempt_enable();
+       /* strparser clones the skb before handing it to an upper layer,
+        * meaning skb_orphan has been called. We NULL sk on the way out
+        * to ensure we don't trigger a BUG_ON() in skb/sk operations
+        * later and because we are not charging the memory of this skb
+        * to any socket yet.
+        */
+       skb->sk = NULL;
+       return ret;
+}
+
+/* Map a strparser back to the psock that embeds it (strp lives inside
+ * psock->parser).
+ */
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+       struct sk_psock_parser *owner =
+               container_of(strp, struct sk_psock_parser, strp);
+
+       return container_of(owner, struct sk_psock, parser);
+}
+
+/* Act on the verdict computed for @skb.
+ *
+ * __SK_REDIRECT: queue the skb on the target socket's psock and kick its
+ * backlog worker, provided the target has a psock with TX enabled, is not
+ * dead, and has room (writeable for egress, rmem within sk_rcvbuf for
+ * ingress). In every other case, including __SK_DROP and a redirect that
+ * cannot be honoured, the skb is freed.
+ */
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+                                  struct sk_buff *skb, int verdict)
+{
+       struct sk_psock *psock_other;
+       struct sock *sk_other;
+       bool ingress;
+
+       switch (verdict) {
+       case __SK_REDIRECT:
+               sk_other = tcp_skb_bpf_redirect_fetch(skb);
+               if (unlikely(!sk_other))
+                       goto out_free;
+               psock_other = sk_psock(sk_other);
+               if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+                   !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
+                       goto out_free;
+               ingress = tcp_skb_bpf_ingress(skb);
+               if ((!ingress && sock_writeable(sk_other)) ||
+                   (ingress &&
+                    atomic_read(&sk_other->sk_rmem_alloc) <=
+                    sk_other->sk_rcvbuf)) {
+                       /* Egress skbs are charged to the target's write
+                        * memory before being queued.
+                        */
+                       if (!ingress)
+                               skb_set_owner_w(skb, sk_other);
+                       skb_queue_tail(&psock_other->ingress_skb, skb);
+                       schedule_work(&psock_other->work);
+                       break;
+               }
+               /* fall-through */
+       case __SK_DROP:
+               /* fall-through */
+       default:
+out_free:
+               kfree_skb(skb);
+       }
+}
+
+/* strparser rcv_msg callback: run the skb_verdict program on a parsed
+ * message and apply the outcome. Without a program the skb is dropped,
+ * because the verdict defaults to __SK_DROP.
+ */
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+       struct sk_psock *psock = sk_psock_from_strp(strp);
+       struct bpf_prog *verdict_prog;
+       int verdict = __SK_DROP;
+
+       rcu_read_lock();
+       verdict_prog = READ_ONCE(psock->progs.skb_verdict);
+       if (likely(verdict_prog)) {
+               skb_orphan(skb);
+               /* Start from a clean redirect state for this skb. */
+               tcp_skb_bpf_redirect_clear(skb);
+               verdict = sk_psock_bpf_run(psock, verdict_prog, skb);
+               verdict = sk_psock_map_verd(verdict,
+                                           tcp_skb_bpf_redirect_fetch(skb));
+       }
+       rcu_read_unlock();
+
+       sk_psock_verdict_apply(psock, skb, verdict);
+}
+
+/* strparser read_sock_done callback: passes @err through unchanged. */
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+       return err;
+}
+
+/* strparser parse_msg callback: returns the skb_parser program's result
+ * for @skb, or skb->len when no parser program is attached.
+ */
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+       struct sk_psock *psock = sk_psock_from_strp(strp);
+       struct bpf_prog *parser_prog;
+       int parsed = skb->len;
+
+       rcu_read_lock();
+       parser_prog = READ_ONCE(psock->progs.skb_parser);
+       if (likely(parser_prog))
+               parsed = sk_psock_bpf_run(psock, parser_prog, skb);
+       rcu_read_unlock();
+
+       return parsed;
+}
+
+/* sk_data_ready replacement installed by sk_psock_start_strp(): feeds the
+ * psock's stream parser while holding sk_callback_lock.
+ *
+ * Called with socket lock held.
+ */
+static void sk_psock_data_ready(struct sock *sk)
+{
+       struct sk_psock *psock;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (unlikely(!psock))
+               goto unlock;
+
+       write_lock_bh(&sk->sk_callback_lock);
+       strp_data_ready(&psock->parser.strp);
+       write_unlock_bh(&sk->sk_callback_lock);
+unlock:
+       rcu_read_unlock();
+}
+
+/* sk_write_space replacement installed by sk_psock_start_strp().
+ *
+ * Kicks the psock's backlog worker when TX is enabled, then chains to the
+ * write_space callback saved in the psock. The saved callback is read
+ * only while the psock pointer is known to be valid: if the socket has no
+ * psock there is nothing to chain to, and dereferencing it would be a
+ * NULL pointer access.
+ */
+static void sk_psock_write_space(struct sock *sk)
+{
+       void (*write_space)(struct sock *sk) = NULL;
+       struct sk_psock *psock;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock)) {
+               if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+                       schedule_work(&psock->work);
+               write_space = psock->saved_write_space;
+       }
+       rcu_read_unlock();
+       /* Call the original callback outside the RCU read section. */
+       if (write_space)
+               write_space(sk);
+}
+
+/* Initialise @psock's stream parser for @sk with the sk_psock_strp_*
+ * callbacks. The parser is left marked as not enabled;
+ * sk_psock_start_strp() switches it on. Returns strp_init()'s result.
+ */
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+       static const struct strp_callbacks psock_strp_cb = {
+               .rcv_msg        = sk_psock_strp_read,
+               .read_sock_done = sk_psock_strp_read_done,
+               .parse_msg      = sk_psock_strp_parse,
+       };
+       struct sk_psock_parser *parser = &psock->parser;
+
+       parser->enabled = false;
+       return strp_init(&parser->strp, sk, &psock_strp_cb);
+}
+
+/* Enable the stream parser on @sk: remember the socket's current
+ * sk_data_ready and install the psock's data_ready and write_space
+ * callbacks. Does nothing if the parser is already enabled.
+ */
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (!parser->enabled) {
+               parser->saved_data_ready = sk->sk_data_ready;
+               sk->sk_data_ready = sk_psock_data_ready;
+               sk->sk_write_space = sk_psock_write_space;
+               parser->enabled = true;
+       }
+}
+
+/* Disable the stream parser on @sk: put back the saved sk_data_ready
+ * callback and stop the strparser. Does nothing if the parser is not
+ * enabled. sk_write_space is not touched here.
+ */
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (parser->enabled) {
+               sk->sk_data_ready = parser->saved_data_ready;
+               parser->saved_data_ready = NULL;
+               strp_stop(&parser->strp);
+               parser->enabled = false;
+       }
+}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c

new file mode 100644 (file)

index 0000000..3c0e44c
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1002 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+/* Array-backed sock map: embeds the generic bpf_map, an array of
+ * max_entries socket pointers (sks), the programs attached to the map
+ * (progs) and a raw spinlock taken when slots in sks are cleared.
+ */
+struct bpf_stab {
+       struct bpf_map map;
+       struct sock **sks;
+       struct sk_psock_progs progs;
+       raw_spinlock_t lock;
+};
+
+/* Map creation flags accepted by sock_map_alloc(); any other bit in
+ * attr->map_flags is rejected with -EINVAL.
+ */
+#define SOCK_CREATE_FLAG_MASK                          \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* Allocate a sock map described by @attr.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise). Keys and values must both be
+ * four bytes, max_entries must be non-zero and only the flags in
+ * SOCK_CREATE_FLAG_MASK are allowed (-EINVAL otherwise). Returns the
+ * embedded bpf_map on success, or an ERR_PTR: -EINVAL when the array size
+ * is too large, the memlock precharge error, or -ENOMEM.
+ */
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+       struct bpf_stab *stab;
+       u64 cost;
+       int err;
+
+       if (!capable(CAP_NET_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       if (attr->max_entries == 0 ||
+           attr->key_size    != 4 ||
+           attr->value_size  != 4)
+               return ERR_PTR(-EINVAL);
+       if (attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+               return ERR_PTR(-EINVAL);
+
+       stab = kzalloc(sizeof(*stab), GFP_USER);
+       if (!stab)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&stab->map, attr);
+       raw_spin_lock_init(&stab->lock);
+
+       /* Make sure page count doesn't overflow. */
+       cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+       err = -EINVAL;
+       if (cost >= U32_MAX - PAGE_SIZE)
+               goto free_stab;
+
+       stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+       err = bpf_map_precharge_memlock(stab->map.pages);
+       if (err)
+               goto free_stab;
+
+       err = -ENOMEM;
+       stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+                                      sizeof(struct sock *),
+                                      stab->map.numa_node);
+       if (!stab->sks)
+               goto free_stab;
+
+       return &stab->map;
+
+free_stab:
+       kfree(stab);
+       return ERR_PTR(err);
+}
+
+/* Resolve attr->target_fd to a BPF map and update the map's program slot
+ * selected by attr->attach_type with @prog. Returns the error from the map
+ * lookup, or sock_map_prog_update()'s result.
+ */
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+       struct fd f = fdget(attr->target_fd);
+       struct bpf_map *map = __bpf_map_get(f);
+       int ret;
+
+       if (IS_ERR(map))
+               return PTR_ERR(map);
+
+       ret = sock_map_prog_update(map, prog, attr->attach_type);
+       fdput(f);
+
+       return ret;
+}
+
+/* Enter the context used for map updates on @sk: socket lock, then
+ * preemption disabled, then an RCU read-side section. Undone in reverse
+ * order by sock_map_sk_release().
+ */
+static void sock_map_sk_acquire(struct sock *sk)
+       __acquires(&sk->sk_lock.slock)
+{
+       lock_sock(sk);
+
+       preempt_disable();
+       rcu_read_lock();
+}
+
+/* Leave the context entered by sock_map_sk_acquire(): end the RCU
+ * read-side section, re-enable preemption, release the socket lock.
+ */
+static void sock_map_sk_release(struct sock *sk)
+       __releases(&sk->sk_lock.slock)
+{
+       rcu_read_unlock();
+       preempt_enable();
+
+       release_sock(sk);
+}
+
+/* Record on @psock that it is referenced from @map through @link_raw
+ * (the map slot). @link is filled in and appended to the psock's link
+ * list under link_lock.
+ */
+static void sock_map_add_link(struct sk_psock *psock,
+                             struct sk_psock_link *link,
+                             struct bpf_map *map, void *link_raw)
+{
+       link->map = map;
+       link->link_raw = link_raw;
+
+       spin_lock_bh(&psock->link_lock);
+       list_add_tail(&link->list, &psock->link);
+       spin_unlock_bh(&psock->link_lock);
+}
+
+/* Remove every link of @psock that refers to map slot @link_raw.
+ *
+ * If the psock's parser is enabled and the owning map has an skb_parser
+ * program, the stream parser is stopped afterwards, under
+ * sk_callback_lock and outside link_lock.
+ */
+static void sock_map_del_link(struct sock *sk,
+                             struct sk_psock *psock, void *link_raw)
+{
+       struct sk_psock_link *cur, *next;
+       bool stop_parser = false;
+
+       spin_lock_bh(&psock->link_lock);
+       list_for_each_entry_safe(cur, next, &psock->link, list) {
+               struct bpf_stab *stab;
+
+               if (cur->link_raw != link_raw)
+                       continue;
+
+               stab = container_of(cur->map, struct bpf_stab, map);
+               if (psock->parser.enabled && stab->progs.skb_parser)
+                       stop_parser = true;
+               list_del(&cur->list);
+               sk_psock_free_link(cur);
+       }
+       spin_unlock_bh(&psock->link_lock);
+
+       if (!stop_parser)
+               return;
+
+       write_lock_bh(&sk->sk_callback_lock);
+       sk_psock_stop_strp(sk, psock);
+       write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* Drop the map's hold on @sk for slot @link_raw: remove the matching
+ * psock link(s) and put the psock reference. A socket without a psock is
+ * ignored.
+ */
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+       struct sk_psock *psock = sk_psock(sk);
+
+       if (unlikely(!psock))
+               return;
+
+       sock_map_del_link(sk, psock, link_raw);
+       sk_psock_put(sk, psock);
+}
+
+/* Attach @sk to @map's programs (@progs), creating the socket's psock on
+ * first use.
+ *
+ * References are taken on the programs that will be used: skb_parser and
+ * skb_verdict only when both are set, plus msg_parser when set. An
+ * existing psock that already has a conflicting program yields -EBUSY.
+ * For a new psock the TCP BPF proto is initialised, for an existing one it
+ * is re-initialised. The stream parser is set up and started under
+ * sk_callback_lock when skb programs are present and the parser is not
+ * already enabled.
+ *
+ * Returns 0 on success or a negative errno; on failure the program
+ * references taken here are dropped again.
+ */
+static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
+                        struct sock *sk)
+{
+       struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+       bool skb_progs, sk_psock_is_new = false;
+       struct sk_psock *psock;
+       int ret;
+
+       skb_verdict = READ_ONCE(progs->skb_verdict);
+       skb_parser = READ_ONCE(progs->skb_parser);
+       /* The skb path is only used when both programs are attached. */
+       skb_progs = skb_parser && skb_verdict;
+       if (skb_progs) {
+               skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+               if (IS_ERR(skb_verdict))
+                       return PTR_ERR(skb_verdict);
+               skb_parser = bpf_prog_inc_not_zero(skb_parser);
+               if (IS_ERR(skb_parser)) {
+                       bpf_prog_put(skb_verdict);
+                       return PTR_ERR(skb_parser);
+               }
+       }
+
+       msg_parser = READ_ONCE(progs->msg_parser);
+       if (msg_parser) {
+               msg_parser = bpf_prog_inc_not_zero(msg_parser);
+               if (IS_ERR(msg_parser)) {
+                       ret = PTR_ERR(msg_parser);
+                       goto out;
+               }
+       }
+
+       psock = sk_psock_get(sk);
+       if (psock) {
+               if (!sk_has_psock(sk)) {
+                       ret = -EBUSY;
+                       goto out_progs;
+               }
+               /* Refuse to replace programs already set on the psock. */
+               if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+                   (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
+                       sk_psock_put(sk, psock);
+                       ret = -EBUSY;
+                       goto out_progs;
+               }
+       } else {
+               psock = sk_psock_init(sk, map->numa_node);
+               if (!psock) {
+                       ret = -ENOMEM;
+                       goto out_progs;
+               }
+               sk_psock_is_new = true;
+       }
+
+       if (msg_parser)
+               psock_set_prog(&psock->progs.msg_parser, msg_parser);
+       if (sk_psock_is_new) {
+               ret = tcp_bpf_init(sk);
+               if (ret < 0)
+                       goto out_drop;
+       } else {
+               tcp_bpf_reinit(sk);
+       }
+
+       write_lock_bh(&sk->sk_callback_lock);
+       if (skb_progs && !psock->parser.enabled) {
+               ret = sk_psock_init_strp(sk, psock);
+               if (ret) {
+                       write_unlock_bh(&sk->sk_callback_lock);
+                       goto out_drop;
+               }
+               psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+               psock_set_prog(&psock->progs.skb_parser, skb_parser);
+               sk_psock_start_strp(sk, psock);
+       }
+       write_unlock_bh(&sk->sk_callback_lock);
+       return 0;
+out_drop:
+       sk_psock_put(sk, psock);
+out_progs:
+       if (msg_parser)
+               bpf_prog_put(msg_parser);
+out:
+       if (skb_progs) {
+               bpf_prog_put(skb_verdict);
+               bpf_prog_put(skb_parser);
+       }
+       return ret;
+}
+
+/* Tear down a sock map: after an RCU grace period, every populated slot
+ * is cleared and its socket unreffed under the map lock, then the slot
+ * array and the map itself are freed.
+ */
+static void sock_map_free(struct bpf_map *map)
+{
+       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+       int i;
+
+       synchronize_rcu();
+       rcu_read_lock();
+       raw_spin_lock_bh(&stab->lock);
+       for (i = 0; i < stab->map.max_entries; i++) {
+               struct sock **slot = &stab->sks[i];
+               struct sock *sk = xchg(slot, NULL);
+
+               if (!sk)
+                       continue;
+               sock_map_unref(sk, slot);
+       }
+       raw_spin_unlock_bh(&stab->lock);
+       rcu_read_unlock();
+
+       bpf_map_area_free(stab->sks);
+       kfree(stab);
+}
+
+/* ->map_release_uref callback: hand the sockmap's program set to
+ * psock_progs_drop().
+ */
+static void sock_map_release_progs(struct bpf_map *map)
+{
+       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+       psock_progs_drop(&stab->progs);
+}
+
+/* Return the socket stored at index @key, or NULL when @key is out of
+ * range or the slot holds NULL. Warns once when called outside an RCU
+ * read-side section.
+ */
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+       struct bpf_stab *stab;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (unlikely(key >= map->max_entries))
+               return NULL;
+
+       stab = container_of(map, struct bpf_stab, map);
+       return READ_ONCE(stab->sks[key]);
+}
+
+/* ->map_lookup_elem callback shared by sock_map_ops and sock_hash_ops:
+ * always fails with -EOPNOTSUPP; @map and @key are not used.
+ */
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+/* Remove the socket stored in slot @psk.
+ *
+ * When @sk_test is non-NULL the slot is only cleared if it still holds
+ * @sk_test; with a NULL @sk_test whatever socket is present is removed.
+ * Only a socket that was actually taken out of the slot is passed to
+ * sock_map_unref(). A slot that meanwhile holds a different socket is left
+ * untouched, and that socket is not unlinked.
+ *
+ * Returns 0 when a socket was removed, -EINVAL when nothing was removed
+ * (empty slot, or @sk_test no longer matches).
+ */
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+                            struct sock **psk)
+{
+       struct sock *sk = NULL;
+
+       raw_spin_lock_bh(&stab->lock);
+       if (!sk_test || sk_test == *psk) {
+               sk = *psk;
+               *psk = NULL;
+       }
+       raw_spin_unlock_bh(&stab->lock);
+       if (unlikely(!sk))
+               return -EINVAL;
+       sock_map_unref(sk, psk);
+       return 0;
+}
+
+/* Link-side removal for sockmap: @link_raw is the array slot and @sk the
+ * socket expected to be in it. The result of __sock_map_delete() is
+ * ignored.
+ */
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+                                     void *link_raw)
+{
+       struct sock **psk = link_raw;
+
+       __sock_map_delete(container_of(map, struct bpf_stab, map), sk, psk);
+}
+
+/* ->map_delete_elem callback: @key points at a u32 array index.
+ * Returns -EINVAL for an out-of-range index, otherwise the result of
+ * __sock_map_delete() on that slot.
+ */
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+       const u32 idx = *(u32 *)key;
+
+       if (unlikely(idx >= map->max_entries))
+               return -EINVAL;
+
+       return __sock_map_delete(stab, NULL, &stab->sks[idx]);
+}
+
+/* ->map_get_next_key callback: write the index following @key to @next.
+ * A NULL @key or an out-of-range index restarts at 0; the last valid
+ * index yields -ENOENT.
+ */
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+       const u32 max = stab->map.max_entries;
+       u32 cur = U32_MAX;
+
+       if (key)
+               cur = *(u32 *)key;
+
+       if (cur == max - 1)
+               return -ENOENT;
+
+       *(u32 *)next = cur < max ? cur + 1 : 0;
+       return 0;
+}
+
+/* Store @sk at index @idx of the sockmap, honouring BPF_NOEXIST/BPF_EXIST.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL  @flags is larger than BPF_EXIST
+ *   -E2BIG   @idx is outside the map
+ *   -ENOMEM  no link could be allocated
+ *   -EEXIST  slot occupied and BPF_NOEXIST was requested
+ *   -ENOENT  slot empty and BPF_EXIST was requested
+ * or whatever negative value sock_map_link() reported.
+ */
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+                                 struct sock *sk, u64 flags)
+{
+       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+       struct sk_psock_link *link;
+       struct sk_psock *psock;
+       struct sock *osk;
+       int ret;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       if (unlikely(flags > BPF_EXIST))
+               return -EINVAL;
+       if (unlikely(idx >= map->max_entries))
+               return -E2BIG;
+
+       /* Allocate the link before taking stab->lock. */
+       link = sk_psock_init_link();
+       if (!link)
+               return -ENOMEM;
+
+       ret = sock_map_link(map, &stab->progs, sk);
+       if (ret < 0)
+               goto out_free;
+
+       psock = sk_psock(sk);
+       WARN_ON_ONCE(!psock);
+
+       raw_spin_lock_bh(&stab->lock);
+       osk = stab->sks[idx];
+       if (osk && flags == BPF_NOEXIST) {
+               ret = -EEXIST;
+               goto out_unlock;
+       } else if (!osk && flags == BPF_EXIST) {
+               ret = -ENOENT;
+               goto out_unlock;
+       }
+
+       /* Record the slot on the psock, publish @sk, then unref any socket
+        * that previously occupied the slot.
+        */
+       sock_map_add_link(psock, link, map, &stab->sks[idx]);
+       stab->sks[idx] = sk;
+       if (osk)
+               sock_map_unref(osk, &stab->sks[idx]);
+       raw_spin_unlock_bh(&stab->lock);
+       return 0;
+out_unlock:
+       raw_spin_unlock_bh(&stab->lock);
+       /* psock was only WARNed about above, so it may still be NULL here. */
+       if (psock)
+               sk_psock_put(sk, psock);
+out_free:
+       sk_psock_free_link(link);
+       return ret;
+}
+
+/* True only for the passive- and active-established sock_ops callbacks. */
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+       switch (ops->op) {
+       case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+       case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/* True when @sk is a SOCK_STREAM socket using IPPROTO_TCP. */
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+       if (sk->sk_type != SOCK_STREAM)
+               return false;
+
+       return sk->sk_protocol == IPPROTO_TCP;
+}
+
+/* ->map_update_elem callback: @key points at the u32 array index and
+ * @value at a u32 socket file descriptor.
+ *
+ * Returns the sockfd_lookup() error when the fd cannot be resolved,
+ * -EINVAL when the socket has no struct sock, -EOPNOTSUPP when it is not
+ * a TCP stream socket in TCP_ESTABLISHED state, and otherwise the result
+ * of sock_map_update_common().
+ */
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+                               void *value, u64 flags)
+{
+       u32 ufd = *(u32 *)value;
+       u32 idx = *(u32 *)key;
+       struct socket *sock;
+       struct sock *sk;
+       int ret;
+
+       sock = sockfd_lookup(ufd, &ret);
+       if (!sock)
+               return ret;
+       sk = sock->sk;
+       if (!sk) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!sock_map_sk_is_suitable(sk) ||
+           sk->sk_state != TCP_ESTABLISHED) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       /* The update runs between sock_map_sk_acquire()/release(). */
+       sock_map_sk_acquire(sk);
+       ret = sock_map_update_common(map, idx, sk, flags);
+       sock_map_sk_release(sk);
+out:
+       /* Every path that resolved the fd releases the file here. */
+       fput(sock->file);
+       return ret;
+}
+
+/* BPF helper: insert the sock_ops socket into a sockmap at the index that
+ * @key points to. Fails with -EOPNOTSUPP unless the socket passes
+ * sock_map_sk_is_suitable() and the op passes sock_map_op_okay().
+ */
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (unlikely(!sock_map_sk_is_suitable(sops->sk) ||
+                    !sock_map_op_okay(sops)))
+               return -EOPNOTSUPP;
+
+       return sock_map_update_common(map, *(u32 *)key, sops->sk, flags);
+}
+
+/* Prototype for bpf_sock_map_update(): program context, constant map
+ * pointer, pointer to a map key and an unconstrained flags value; returns
+ * an integer. Not restricted to GPL programs.
+ */
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+       .func           = bpf_sock_map_update,
+       .gpl_only       = false,
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @skb from sockmap index @key.
+ * Any flag other than BPF_F_INGRESS, or an empty/out-of-range slot, gives
+ * SK_DROP; otherwise the target and flags are stored in the skb's TCP
+ * control block and SK_PASS is returned.
+ */
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+          struct bpf_map *, map, u32, key, u64, flags)
+{
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       struct sock *target;
+
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       tcb->bpf.flags = flags;
+       target = __sock_map_lookup_elem(map, key);
+       tcb->bpf.sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_sk_redirect_map(): program context, constant map
+ * pointer, then two unconstrained scalars (key and flags); returns an
+ * integer.
+ */
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+       .func           = bpf_sk_redirect_map,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @msg from sockmap index @key.
+ * Any flag other than BPF_F_INGRESS, or an empty/out-of-range slot, gives
+ * SK_DROP; otherwise the target and flags are stored in @msg and SK_PASS
+ * is returned.
+ */
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+          struct bpf_map *, map, u32, key, u64, flags)
+{
+       struct sock *target;
+
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       msg->flags = flags;
+       target = __sock_map_lookup_elem(map, key);
+       msg->sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_msg_redirect_map(): program context, constant map
+ * pointer, then two unconstrained scalars (key and flags); returns an
+ * integer.
+ */
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+       .func           = bpf_msg_redirect_map,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* Map operations for the array-backed (struct bpf_stab) socket map.
+ * Lookups are routed to sock_map_lookup(), which always returns
+ * -EOPNOTSUPP.
+ */
+const struct bpf_map_ops sock_map_ops = {
+       .map_alloc              = sock_map_alloc,
+       .map_free               = sock_map_free,
+       .map_get_next_key       = sock_map_get_next_key,
+       .map_update_elem        = sock_map_update_elem,
+       .map_delete_elem        = sock_map_delete_elem,
+       .map_lookup_elem        = sock_map_lookup,
+       .map_release_uref       = sock_map_release_progs,
+       .map_check_btf          = map_check_no_btf,
+};
+
+/* One sockhash entry. Allocated with htab->elem_size bytes so that the
+ * key bytes follow the fixed part; freed through @rcu via kfree_rcu().
+ */
+struct bpf_htab_elem {
+       struct rcu_head rcu;
+       u32 hash;               /* hash of @key, cached for bucket lookups */
+       struct sock *sk;        /* socket stored under @key */
+       struct hlist_node node; /* linkage in the owning bucket's list */
+       u8 key[];               /* key bytes; flexible array member */
+};
+
+/* One hash bucket: an hlist of bpf_htab_elem plus the lock that the
+ * update, delete and free paths hold while modifying the list.
+ */
+struct bpf_htab_bucket {
+       struct hlist_head head;
+       raw_spinlock_t lock;
+};
+
+/* Hash-backed socket map. */
+struct bpf_htab {
+       struct bpf_map map;
+       struct bpf_htab_bucket *buckets;
+       u32 buckets_num;        /* max_entries rounded up to a power of two */
+       u32 elem_size;          /* sizeof(bpf_htab_elem) + key size rounded up to 8 */
+       struct sk_psock_progs progs;
+       atomic_t count;         /* bumped on element alloc, dropped on free */
+};
+
+/* Hash @len bytes of @key with jhash() using an initial value of 0. */
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+       return jhash(key, len, 0);
+}
+
+/* Map @hash to its bucket by masking with buckets_num - 1. */
+static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
+                                                      u32 hash)
+{
+       const u32 mask = htab->buckets_num - 1;
+
+       return htab->buckets + (hash & mask);
+}
+
+/* Walk bucket list @head and return the first element whose cached hash
+ * equals @hash and whose key matches @key over @key_size bytes, or NULL
+ * when there is none.
+ */
+static struct bpf_htab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+                         u32 key_size)
+{
+       struct bpf_htab_elem *cur;
+
+       hlist_for_each_entry_rcu(cur, head, node) {
+               if (cur->hash != hash)
+                       continue;
+               if (memcmp(&cur->key, key, key_size) == 0)
+                       return cur;
+       }
+
+       return NULL;
+}
+
+/* Return the socket stored under @key in the sockhash, or NULL when the
+ * key is absent. Warns once when called outside an RCU read-side section.
+ */
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_htab_elem *match;
+       struct hlist_head *head;
+       u32 hash;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       hash = sock_hash_bucket_hash(key, map->key_size);
+       head = &sock_hash_select_bucket(htab, hash)->head;
+       match = sock_hash_lookup_elem_raw(head, hash, key, map->key_size);
+       if (!match)
+               return NULL;
+
+       return match->sk;
+}
+
+/* Drop @elem from the map's element count and free it with kfree_rcu(),
+ * i.e. the memory is released through the element's rcu head.
+ */
+static void sock_hash_free_elem(struct bpf_htab *htab,
+                               struct bpf_htab_elem *elem)
+{
+       atomic_dec(&htab->count);
+       kfree_rcu(elem, rcu);
+}
+
+/* Link-side removal for sockhash: @link_raw is the bpf_htab_elem that was
+ * registered for the socket. The element is removed only if the bucket
+ * lookup still returns that very element. @sk is not referenced here.
+ */
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+                                      void *link_raw)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_htab_elem *elem_probe, *elem = link_raw;
+       struct bpf_htab_bucket *bucket;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       bucket = sock_hash_select_bucket(htab, elem->hash);
+
+       /* elem may be deleted in parallel from the map, but access here
+        * is okay since it's going away only after RCU grace period.
+        * However, we need to check whether it's still present.
+        */
+       raw_spin_lock_bh(&bucket->lock);
+       elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+                                              elem->key, map->key_size);
+       if (elem_probe && elem_probe == elem) {
+               /* Unlink from the bucket, unref the socket, free the elem. */
+               hlist_del_rcu(&elem->node);
+               sock_map_unref(elem->sk, elem);
+               sock_hash_free_elem(htab, elem);
+       }
+       raw_spin_unlock_bh(&bucket->lock);
+}
+
+/* ->map_delete_elem callback for sockhash: remove the element stored
+ * under @key. Returns 0 when an element was removed, -ENOENT otherwise.
+ */
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 hash, key_size = map->key_size;
+       struct bpf_htab_bucket *bucket;
+       struct bpf_htab_elem *elem;
+       int ret = -ENOENT;
+
+       hash = sock_hash_bucket_hash(key, key_size);
+       bucket = sock_hash_select_bucket(htab, hash);
+
+       /* Lookup and removal both happen under the bucket lock. */
+       raw_spin_lock_bh(&bucket->lock);
+       elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+       if (elem) {
+               hlist_del_rcu(&elem->node);
+               sock_map_unref(elem->sk, elem);
+               sock_hash_free_elem(htab, elem);
+               ret = 0;
+       }
+       raw_spin_unlock_bh(&bucket->lock);
+       return ret;
+}
+
+/* Allocate and fill a new sockhash element for @key/@sk.
+ *
+ * The element count is incremented first. Exceeding max_entries is only
+ * an error (-E2BIG) when there is no @old element. Returns the new element
+ * or an ERR_PTR (-E2BIG, -ENOMEM); on error the count increment is undone.
+ * The element is not linked into any bucket here.
+ */
+static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
+                                                 void *key, u32 key_size,
+                                                 u32 hash, struct sock *sk,
+                                                 struct bpf_htab_elem *old)
+{
+       struct bpf_htab_elem *elem;
+
+       if (atomic_inc_return(&htab->count) > htab->map.max_entries && !old) {
+               atomic_dec(&htab->count);
+               return ERR_PTR(-E2BIG);
+       }
+
+       elem = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+                           htab->map.numa_node);
+       if (!elem) {
+               atomic_dec(&htab->count);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       memcpy(elem->key, key, key_size);
+       elem->sk = sk;
+       elem->hash = hash;
+       return elem;
+}
+
+/* Store @sk under @key in the sockhash, honouring BPF_NOEXIST/BPF_EXIST.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL  @flags is larger than BPF_EXIST
+ *   -ENOMEM  no link could be allocated
+ *   -EEXIST  key present and BPF_NOEXIST was requested
+ *   -ENOENT  key absent and BPF_EXIST was requested
+ * or the error from sock_map_link() / sock_hash_alloc_elem().
+ */
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+                                  struct sock *sk, u64 flags)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 key_size = map->key_size, hash;
+       struct bpf_htab_elem *elem, *elem_new;
+       struct bpf_htab_bucket *bucket;
+       struct sk_psock_link *link;
+       struct sk_psock *psock;
+       int ret;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       if (unlikely(flags > BPF_EXIST))
+               return -EINVAL;
+
+       /* Allocate the link before taking the bucket lock. */
+       link = sk_psock_init_link();
+       if (!link)
+               return -ENOMEM;
+
+       ret = sock_map_link(map, &htab->progs, sk);
+       if (ret < 0)
+               goto out_free;
+
+       psock = sk_psock(sk);
+       WARN_ON_ONCE(!psock);
+
+       hash = sock_hash_bucket_hash(key, key_size);
+       bucket = sock_hash_select_bucket(htab, hash);
+
+       raw_spin_lock_bh(&bucket->lock);
+       elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+       if (elem && flags == BPF_NOEXIST) {
+               ret = -EEXIST;
+               goto out_unlock;
+       } else if (!elem && flags == BPF_EXIST) {
+               ret = -ENOENT;
+               goto out_unlock;
+       }
+
+       elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+       if (IS_ERR(elem_new)) {
+               ret = PTR_ERR(elem_new);
+               goto out_unlock;
+       }
+
+       sock_map_add_link(psock, link, map, elem_new);
+       /* Add new element to the head of the list, so that
+        * concurrent search will find it before old elem.
+        */
+       hlist_add_head_rcu(&elem_new->node, &bucket->head);
+       if (elem) {
+               /* Replace: retire the element previously stored for @key. */
+               hlist_del_rcu(&elem->node);
+               sock_map_unref(elem->sk, elem);
+               sock_hash_free_elem(htab, elem);
+       }
+       raw_spin_unlock_bh(&bucket->lock);
+       return 0;
+out_unlock:
+       raw_spin_unlock_bh(&bucket->lock);
+       /* NOTE(review): unlike sock_map_update_common(), psock is not
+        * NULL-checked before this put - confirm sk_psock() cannot return
+        * NULL after a successful sock_map_link().
+        */
+       sk_psock_put(sk, psock);
+out_free:
+       sk_psock_free_link(link);
+       return ret;
+}
+
+/* ->map_update_elem callback for sockhash: @value points at a u32 socket
+ * file descriptor, @key at the hash key.
+ *
+ * Returns the sockfd_lookup() error when the fd cannot be resolved,
+ * -EINVAL when the socket has no struct sock, -EOPNOTSUPP when it is not
+ * a TCP stream socket in TCP_ESTABLISHED state, and otherwise the result
+ * of sock_hash_update_common().
+ */
+static int sock_hash_update_elem(struct bpf_map *map, void *key,
+                                void *value, u64 flags)
+{
+       u32 ufd = *(u32 *)value;
+       struct socket *sock;
+       struct sock *sk;
+       int ret;
+
+       sock = sockfd_lookup(ufd, &ret);
+       if (!sock)
+               return ret;
+       sk = sock->sk;
+       if (!sk) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!sock_map_sk_is_suitable(sk) ||
+           sk->sk_state != TCP_ESTABLISHED) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       /* The update runs between sock_map_sk_acquire()/release(). */
+       sock_map_sk_acquire(sk);
+       ret = sock_hash_update_common(map, key, sk, flags);
+       sock_map_sk_release(sk);
+out:
+       /* Every path that resolved the fd releases the file here. */
+       fput(sock->file);
+       return ret;
+}
+
+/* ->map_get_next_key callback for sockhash: copy the key that follows
+ * @key in iteration order into @key_next.
+ *
+ * A NULL @key, or a key that is not found, starts from the first bucket.
+ * Returns 0 when a key was copied, -ENOENT when there is no further
+ * element.
+ */
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+                                 void *key_next)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_htab_elem *elem, *elem_next;
+       u32 hash, key_size = map->key_size;
+       struct hlist_head *head;
+       int i = 0;
+
+       if (!key)
+               goto find_first_elem;
+       hash = sock_hash_bucket_hash(key, key_size);
+       head = &sock_hash_select_bucket(htab, hash)->head;
+       elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+       if (!elem)
+               goto find_first_elem;
+
+       /* Prefer the element that follows @key within the same bucket. */
+       elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+                                    struct bpf_htab_elem, node);
+       if (elem_next) {
+               memcpy(key_next, elem_next->key, key_size);
+               return 0;
+       }
+
+       /* Bucket exhausted: continue the scan at the next bucket index. */
+       i = hash & (htab->buckets_num - 1);
+       i++;
+find_first_elem:
+       /* Return the first element of the first non-empty bucket from i on. */
+       for (; i < htab->buckets_num; i++) {
+               head = &sock_hash_select_bucket(htab, i)->head;
+               elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+                                            struct bpf_htab_elem, node);
+               if (elem_next) {
+                       memcpy(key_next, elem_next->key, key_size);
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+/* ->map_alloc callback for sockhash: validate @attr and build the table.
+ *
+ * Returns the embedded bpf_map on success or an ERR_PTR:
+ *   -EPERM   caller lacks CAP_NET_ADMIN
+ *   -EINVAL  zero max_entries/key_size, value_size other than 4, unknown
+ *            map flags, or a table size that fails the overflow/cost checks
+ *   -E2BIG   key_size larger than MAX_BPF_STACK
+ *   -ENOMEM  allocation failure
+ */
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+       struct bpf_htab *htab;
+       int i, err;
+       u64 cost;
+
+       if (!capable(CAP_NET_ADMIN))
+               return ERR_PTR(-EPERM);
+       if (attr->max_entries == 0 ||
+           attr->key_size    == 0 ||
+           attr->value_size  != 4 ||
+           attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+               return ERR_PTR(-EINVAL);
+       if (attr->key_size > MAX_BPF_STACK)
+               return ERR_PTR(-E2BIG);
+
+       htab = kzalloc(sizeof(*htab), GFP_USER);
+       if (!htab)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&htab->map, attr);
+
+       /* A power-of-two bucket count lets bucket selection use a mask. */
+       htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+       htab->elem_size = sizeof(struct bpf_htab_elem) +
+                         round_up(htab->map.key_size, 8);
+       /* Reject a wrapped bucket count or a bucket array over U32_MAX bytes. */
+       if (htab->buckets_num == 0 ||
+           htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
+               err = -EINVAL;
+               goto free_htab;
+       }
+
+       /* Memory cost: all buckets plus max_entries elements. */
+       cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
+              (u64) htab->elem_size * htab->map.max_entries;
+       if (cost >= U32_MAX - PAGE_SIZE) {
+               err = -EINVAL;
+               goto free_htab;
+       }
+
+       htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+                                          sizeof(struct bpf_htab_bucket),
+                                          htab->map.numa_node);
+       if (!htab->buckets) {
+               err = -ENOMEM;
+               goto free_htab;
+       }
+
+       for (i = 0; i < htab->buckets_num; i++) {
+               INIT_HLIST_HEAD(&htab->buckets[i].head);
+               raw_spin_lock_init(&htab->buckets[i].lock);
+       }
+
+       return &htab->map;
+free_htab:
+       kfree(htab);
+       return ERR_PTR(err);
+}
+
+/* Tear down a sockhash: for every element still present, unlink it from
+ * its bucket, pass its socket to sock_map_unref() and free the element,
+ * then release the bucket array and the map itself.
+ */
+static void sock_hash_free(struct bpf_map *map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_htab_bucket *bucket;
+       struct bpf_htab_elem *elem;
+       struct hlist_node *node;
+       int i;
+
+       /* Wait for an RCU grace period before walking the buckets. */
+       synchronize_rcu();
+       rcu_read_lock();
+       for (i = 0; i < htab->buckets_num; i++) {
+               bucket = sock_hash_select_bucket(htab, i);
+               raw_spin_lock_bh(&bucket->lock);
+               hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
+                       hlist_del_rcu(&elem->node);
+                       sock_map_unref(elem->sk, elem);
+                       /* Without this the element memory is never
+                        * released once the bucket array is gone.
+                        */
+                       sock_hash_free_elem(htab, elem);
+               }
+               raw_spin_unlock_bh(&bucket->lock);
+       }
+       rcu_read_unlock();
+
+       bpf_map_area_free(htab->buckets);
+       kfree(htab);
+}
+
+/* ->map_release_uref callback: hand the sockhash's program set to
+ * psock_progs_drop().
+ */
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+       psock_progs_drop(&htab->progs);
+}
+
+/* BPF helper: insert the sock_ops socket into a sockhash under @key.
+ * Fails with -EOPNOTSUPP unless the socket passes
+ * sock_map_sk_is_suitable() and the op passes sock_map_op_okay().
+ */
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (unlikely(!sock_map_sk_is_suitable(sops->sk) ||
+                    !sock_map_op_okay(sops)))
+               return -EOPNOTSUPP;
+
+       return sock_hash_update_common(map, key, sops->sk, flags);
+}
+
+/* Prototype for bpf_sock_hash_update(): program context, constant map
+ * pointer, pointer to a map key and an unconstrained flags value; returns
+ * an integer. Not restricted to GPL programs.
+ */
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+       .func           = bpf_sock_hash_update,
+       .gpl_only       = false,
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @skb from the sockhash entry
+ * stored under @key. Any flag other than BPF_F_INGRESS, or a missing
+ * entry, gives SK_DROP; otherwise the target and flags are stored in the
+ * skb's TCP control block and SK_PASS is returned.
+ */
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       struct sock *target;
+
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       tcb->bpf.flags = flags;
+       target = __sock_hash_lookup_elem(map, key);
+       tcb->bpf.sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_sk_redirect_hash(): program context, constant map
+ * pointer, pointer to a map key and an unconstrained flags value; returns
+ * an integer.
+ */
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+       .func           = bpf_sk_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @msg from the sockhash entry
+ * stored under @key. Any flag other than BPF_F_INGRESS, or a missing
+ * entry, gives SK_DROP; otherwise the target and flags are stored in @msg
+ * and SK_PASS is returned.
+ */
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       struct sock *target;
+
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       msg->flags = flags;
+       target = __sock_hash_lookup_elem(map, key);
+       msg->sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_msg_redirect_hash(): program context, constant map
+ * pointer, pointer to a map key and an unconstrained flags value; returns
+ * an integer.
+ */
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+       .func           = bpf_msg_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* Map operations for the hash-backed (struct bpf_htab) socket map.
+ * Lookups are routed to sock_map_lookup(), which always returns
+ * -EOPNOTSUPP.
+ */
+const struct bpf_map_ops sock_hash_ops = {
+       .map_alloc              = sock_hash_alloc,
+       .map_free               = sock_hash_free,
+       .map_get_next_key       = sock_hash_get_next_key,
+       .map_update_elem        = sock_hash_update_elem,
+       .map_delete_elem        = sock_hash_delete_elem,
+       .map_lookup_elem        = sock_map_lookup,
+       .map_release_uref       = sock_hash_release_progs,
+       .map_check_btf          = map_check_no_btf,
+};
+
+/* Return the program set embedded in @map for the two socket map types,
+ * or NULL for any other map type.
+ */
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+       if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+               return &container_of(map, struct bpf_stab, map)->progs;
+
+       if (map->map_type == BPF_MAP_TYPE_SOCKHASH)
+               return &container_of(map, struct bpf_htab, map)->progs;
+
+       return NULL;
+}
+
+/* Install @prog into the slot of @map's program set selected by attach
+ * type @which. Returns 0 on success, -EOPNOTSUPP when @map is not a
+ * socket map or @which is not one of the three handled attach types.
+ */
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+                        u32 which)
+{
+       struct sk_psock_progs *progs = sock_map_progs(map);
+       struct bpf_prog **slot;
+
+       if (!progs)
+               return -EOPNOTSUPP;
+
+       if (which == BPF_SK_MSG_VERDICT)
+               slot = &progs->msg_parser;
+       else if (which == BPF_SK_SKB_STREAM_PARSER)
+               slot = &progs->skb_parser;
+       else if (which == BPF_SK_SKB_STREAM_VERDICT)
+               slot = &progs->skb_verdict;
+       else
+               return -EOPNOTSUPP;
+
+       psock_set_prog(slot, prog);
+       return 0;
+}
+
+/* Remove @sk from the map entry recorded in @link, dispatching on the
+ * type of the map the link points to. Links to any other map type are
+ * ignored.
+ */
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+       switch (link->map->map_type) {
+       case BPF_MAP_TYPE_SOCKMAP:
+               sock_map_delete_from_link(link->map, sk, link->link_raw);
+               break;
+       case BPF_MAP_TYPE_SOCKHASH:
+               sock_hash_delete_from_link(link->map, sk, link->link_raw);
+               break;
+       default:
+               break;
+       }
+}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile

index 7446b98..5862931 100644 (file)
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
  obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
  obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
  obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
  obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
  
  obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c

new file mode 100644 (file)

index 0000000..80debb0
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+/* Report whether @sk's psock has messages queued on its ingress list.
+ * Returns false when the socket has no psock.
+ */
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+       struct sk_psock *psock;
+       bool has_data = false;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock))
+               has_data = !list_empty(&psock->ingress_msg);
+       rcu_read_unlock();
+
+       return has_data;
+}
+
+/* Sleep on @sk's wait queue, for at most @timeo, until the psock ingress
+ * list or the socket receive queue is non-empty. Returns the value
+ * produced by sk_wait_event(). @flags and @err are accepted but not
+ * referenced in the body.
+ */
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+                            int flags, long timeo, int *err)
+{
+       DEFINE_WAIT_FUNC(wait, woken_wake_function);
+       int ret;
+
+       add_wait_queue(sk_sleep(sk), &wait);
+       sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+       ret = sk_wait_event(sk, &timeo,
+                           !list_empty(&psock->ingress_msg) ||
+                           !skb_queue_empty(&sk->sk_receive_queue), &wait);
+       sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+       remove_wait_queue(sk_sleep(sk), &wait);
+       return ret;
+}
+
+/* Copy up to @len bytes from the messages queued on @psock's ingress list
+ * into @msg's iterator, consuming scatterlist elements as they drain.
+ *
+ * Returns the number of bytes copied (0 when the list is empty), or
+ * -EFAULT when copy_page_to_iter() copies fewer bytes than requested; in
+ * that case bytes copied earlier in the same call are not reported.
+ */
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+                     struct msghdr *msg, int len)
+{
+       struct iov_iter *iter = &msg->msg_iter;
+       int i, ret, copied = 0;
+
+       while (copied != len) {
+               struct scatterlist *sge;
+               struct sk_msg *msg_rx;
+
+               msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+                                                 struct sk_msg, list);
+               if (unlikely(!msg_rx))
+                       break;
+
+               i = msg_rx->sg.start;
+               do {
+                       struct page *page;
+                       int copy;
+
+                       sge = sk_msg_elem(msg_rx, i);
+                       copy = sge->length;
+                       page = sg_page(sge);
+                       /* Clamp to what the caller still has room for. */
+                       if (copied + copy > len)
+                               copy = len - copied;
+                       ret = copy_page_to_iter(page, sge->offset, copy, iter);
+                       if (ret != copy) {
+                               msg_rx->sg.start = i;
+                               return -EFAULT;
+                       }
+
+                       copied += copy;
+                       sge->offset += copy;
+                       sge->length -= copy;
+                       sk_mem_uncharge(sk, copy);
+                       if (!sge->length) {
+                               /* Element drained: advance, wrapping the
+                                * index at MAX_SKB_FRAGS.
+                                */
+                               i++;
+                               if (i == MAX_SKB_FRAGS)
+                                       i = 0;
+                               /* The page ref is dropped here only for
+                                * messages without an skb attached.
+                                */
+                               if (!msg_rx->skb)
+                                       put_page(page);
+                       }
+
+                       if (copied == len)
+                               break;
+               } while (i != msg_rx->sg.end);
+
+               msg_rx->sg.start = i;
+               /* Message fully consumed: unlink it and free it, along with
+                * its skb if there is one.
+                */
+               if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+                       list_del(&msg_rx->list);
+                       if (msg_rx->skb)
+                               consume_skb(msg_rx->skb);
+                       kfree(msg_rx);
+               }
+       }
+
+       return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+/* recvmsg for sockets with a psock.
+ *
+ * MSG_ERRQUEUE requests go to inet_recv_error(). tcp_recvmsg() is used
+ * when the socket receive queue already holds data or the socket has no
+ * psock. Otherwise data is taken from the psock ingress list; if nothing
+ * was copied the call waits for data and then either retries the ingress
+ * list or falls back to tcp_recvmsg(), depending on which queue has data.
+ *
+ * Returns the result of __tcp_bpf_recvmsg() (bytes copied, possibly 0, or
+ * a negative error) or the result of the function it delegated to.
+ */
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+                   int nonblock, int flags, int *addr_len)
+{
+       struct sk_psock *psock;
+       int copied, ret;
+
+       if (unlikely(flags & MSG_ERRQUEUE))
+               return inet_recv_error(sk, msg, len, addr_len);
+       if (!skb_queue_empty(&sk->sk_receive_queue))
+               return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+       psock = sk_psock_get(sk);
+       if (unlikely(!psock))
+               return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+       lock_sock(sk);
+msg_bytes_ready:
+       copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+       if (!copied) {
+               int data, err = 0;
+               long timeo;
+
+               timeo = sock_rcvtimeo(sk, nonblock);
+               data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+               if (data) {
+                       /* Receive queue still empty: retry the psock list. */
+                       if (skb_queue_empty(&sk->sk_receive_queue))
+                               goto msg_bytes_ready;
+                       /* Data is on the regular receive queue: drop the
+                        * lock and the psock reference before delegating.
+                        */
+                       release_sock(sk);
+                       sk_psock_put(sk, psock);
+                       return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+               }
+               if (err) {
+                       ret = err;
+                       goto out;
+               }
+       }
+       ret = copied;
+out:
+       release_sock(sk);
+       sk_psock_put(sk, psock);
+       return ret;
+}
+
+/* Move scatterlist elements from @msg into a freshly allocated sk_msg and
+ * queue that on @psock for @sk. A non-zero @apply_bytes limits how many
+ * bytes are moved. @flags is not referenced in the body.
+ *
+ * Returns 0 on success, or -ENOMEM when the temporary message cannot be
+ * allocated or when sk_wmem_schedule() fails before any bytes were moved.
+ */
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+                          struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+       bool apply = apply_bytes;
+       struct scatterlist *sge;
+       u32 size, copied = 0;
+       struct sk_msg *tmp;
+       int i, ret = 0;
+
+       tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+       if (unlikely(!tmp))
+               return -ENOMEM;
+
+       lock_sock(sk);
+       tmp->sg.start = msg->sg.start;
+       i = msg->sg.start;
+       do {
+               sge = sk_msg_elem(msg, i);
+               /* Take the whole element unless the apply budget is smaller. */
+               size = (apply && apply_bytes < sge->length) ?
+                       apply_bytes : sge->length;
+               if (!sk_wmem_schedule(sk, size)) {
+                       /* Only an error if nothing at all was moved. */
+                       if (!copied)
+                               ret = -ENOMEM;
+                       break;
+               }
+
+               sk_mem_charge(sk, size);
+               sk_msg_xfer(tmp, msg, i, size);
+               copied += size;
+               if (sge->length)
+                       get_page(sk_msg_page(tmp, i));
+               sk_msg_iter_var_next(i);
+               tmp->sg.end = i;
+               if (apply) {
+                       apply_bytes -= size;
+                       if (!apply_bytes)
+                               break;
+               }
+       } while (i != msg->sg.end);
+
+       if (!ret) {
+               msg->sg.start = i;
+               /* NOTE(review): apply_bytes has already been decremented by
+                * the loop, so this subtracts the bytes that were NOT moved
+                * rather than 'copied' - confirm against sk_msg_xfer()'s
+                * own size accounting.
+                */
+               msg->sg.size -= apply_bytes;
+               sk_psock_queue_msg(psock, tmp);
+               sk->sk_data_ready(sk);
+       } else {
+               sk_msg_free(sk, tmp);
+               kfree(tmp);
+       }
+
+       release_sock(sk);
+       return ret;
+}
+
+/* Transmit @msg on @sk one scatterlist element at a time, starting at
+ * msg->sg.start, using do_tcp_sendpages().
+ *
+ * A non-zero @apply_bytes limits how many bytes are sent; with @uncharge
+ * set, every sent chunk is uncharged from the socket's memory accounting.
+ * Returns 0 once the message or the apply budget is exhausted, or the
+ * value from do_tcp_sendpages() as soon as it returns <= 0.
+ */
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+                       int flags, bool uncharge)
+{
+       bool apply = apply_bytes;
+       struct scatterlist *sge;
+       struct page *page;
+       int size, ret = 0;
+       u32 off;
+
+       while (1) {
+               sge = sk_msg_elem(msg, msg->sg.start);
+               size = (apply && apply_bytes < sge->length) ?
+                       apply_bytes : sge->length;
+               off  = sge->offset;
+               page = sg_page(sge);
+
+               tcp_rate_check_app_limited(sk);
+retry:
+               ret = do_tcp_sendpages(sk, page, off, size, flags);
+               if (ret <= 0)
+                       return ret;
+               /* Account for the bytes that actually went out. */
+               if (apply)
+                       apply_bytes -= ret;
+               msg->sg.size -= ret;
+               sge->offset += ret;
+               sge->length -= ret;
+               if (uncharge)
+                       sk_mem_uncharge(sk, ret);
+               /* Short send: push the remainder of this chunk. */
+               if (ret != size) {
+                       size -= ret;
+                       off  += ret;
+                       goto retry;
+               }
+               if (!sge->length) {
+                       /* Element fully sent: drop the page reference,
+                        * advance the start index and reset the entry.
+                        */
+                       put_page(page);
+                       sk_msg_iter_next(msg, start);
+                       sg_init_table(sge, 1);
+                       if (msg->sg.start == msg->sg.end)
+                               break;
+               }
+               if (apply && !apply_bytes)
+                       break;
+       }
+
+       return 0;
+}
+
+/* Same as tcp_bpf_push(), but takes the socket lock on @sk for the duration
+ * of the call. The result of tcp_bpf_push() is returned unchanged.
+ */
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+                              u32 apply_bytes, int flags, bool uncharge)
+{
+       int err;
+
+       lock_sock(sk);
+       err = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+       release_sock(sk);
+
+       return err;
+}
+
+/* Hand @bytes of @msg to the redirect target @sk.
+ *
+ * If sk_psock_get() yields no psock for @sk, the message is freed and 0 is
+ * returned. Otherwise the message goes to bpf_tcp_ingress() when
+ * sk_msg_to_ingress() is true for it, or to tcp_bpf_push_locked() (without
+ * uncharging) when it is not, and that call's result is returned.
+ */
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+                         u32 bytes, int flags)
+{
+       bool to_ingress = sk_msg_to_ingress(msg);
+       struct sk_psock *psock = sk_psock_get(sk);
+       int err;
+
+       if (unlikely(!psock)) {
+               sk_msg_free(sk, msg);
+               return 0;
+       }
+
+       if (to_ingress)
+               err = bpf_tcp_ingress(sk, psock, msg, bytes, flags);
+       else
+               err = tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+
+       sk_psock_put(sk, psock);
+       return err;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+/* Obtain the verdict for @msg and act on it.
+ *
+ * The verdict cached in psock->eval is reused unless it is __SK_NONE, in
+ * which case sk_psock_msg_verdict() is run. Depending on the verdict the
+ * data is pushed on @sk (__SK_PASS), handed to psock->sk_redir
+ * (__SK_REDIRECT) or freed (__SK_DROP and anything else). At most
+ * psock->apply_bytes bytes are handled per pass when that limit is set; the
+ * function loops while the head element of @msg still holds data.
+ *
+ * @copied is decremented by the number of bytes that were freed instead of
+ * sent, so the caller's byte count reflects what was really consumed.
+ *
+ * Returns 0 on success (including when the message was corked), -ENOMEM if
+ * the cork buffer cannot be allocated, -EACCES on a drop verdict, otherwise
+ * the result of tcp_bpf_push() / tcp_bpf_sendmsg_redir().
+ */
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+                               struct sk_msg *msg, int *copied, int flags)
+{
+       /* NOTE(review): start == end on entry presumably means the ring is
+        * full - confirm against the sk_msg helpers.
+        */
+       bool cork = false, enospc = msg->sg.start == msg->sg.end;
+       struct sock *sk_redir;
+       u32 tosend;
+       int ret;
+
+more_data:
+       if (psock->eval == __SK_NONE)
+               psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+       /* More cork bytes requested than are queued so far: stash a copy of
+        * the message in psock->cork and return until more data arrives.
+        */
+       if (msg->cork_bytes &&
+           msg->cork_bytes > msg->sg.size && !enospc) {
+               psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+               if (!psock->cork) {
+                       psock->cork = kzalloc(sizeof(*psock->cork),
+                                             GFP_ATOMIC | __GFP_NOWARN);
+                       if (!psock->cork)
+                               return -ENOMEM;
+               }
+               memcpy(psock->cork, msg, sizeof(*msg));
+               return 0;
+       }
+
+       /* Handle everything queued unless apply_bytes asks for less. */
+       tosend = msg->sg.size;
+       if (psock->apply_bytes && psock->apply_bytes < tosend)
+               tosend = psock->apply_bytes;
+
+       switch (psock->eval) {
+       case __SK_PASS:
+               ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+               if (unlikely(ret)) {
+                       *copied -= sk_msg_free(sk, msg);
+                       break;
+               }
+               sk_msg_apply_bytes(psock, tosend);
+               break;
+       case __SK_REDIRECT:
+               sk_redir = psock->sk_redir;
+               sk_msg_apply_bytes(psock, tosend);
+               /* Detach any corked message from the psock; it is freed
+                * below once the redirect has been attempted.
+                * NOTE(review): presumably msg == psock->cork here - confirm
+                * against the callers.
+                */
+               if (psock->cork) {
+                       cork = true;
+                       psock->cork = NULL;
+               }
+               sk_msg_return(sk, msg, tosend);
+               /* The lock on @sk is dropped across the redirect call. */
+               release_sock(sk);
+               ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+               lock_sock(sk);
+               if (unlikely(ret < 0)) {
+                       int free = sk_msg_free_nocharge(sk, msg);
+
+                       if (!cork)
+                               *copied -= free;
+               }
+               /* A corked message is always released here and the redirect
+                * result is reported as success.
+                */
+               if (cork) {
+                       sk_msg_free(sk, msg);
+                       kfree(msg);
+                       msg = NULL;
+                       ret = 0;
+               }
+               break;
+       case __SK_DROP:
+       default:
+               sk_msg_free_partial(sk, msg, tosend);
+               sk_msg_apply_bytes(psock, tosend);
+               *copied -= tosend;
+               return -EACCES;
+       }
+
+       if (likely(!ret)) {
+               /* No apply_bytes left: clear the cached verdict and put the
+                * redirect socket so the next pass re-evaluates.
+                */
+               if (!psock->apply_bytes) {
+                       psock->eval =  __SK_NONE;
+                       if (psock->sk_redir) {
+                               sock_put(psock->sk_redir);
+                               psock->sk_redir = NULL;
+                       }
+               }
+               /* Head element still has data: run another pass. */
+               if (msg &&
+                   msg->sg.data[msg->sg.start].page_link &&
+                   msg->sg.data[msg->sg.start].length)
+                       goto more_data;
+       }
+       return ret;
+}
+
+/* sendmsg handler installed in the TCP_BPF_TX proto variant.
+ *
+ * Falls back to tcp_sendmsg() when sk_psock_get() returns no psock.
+ * Otherwise, under the socket lock, user data from @msg is copied into an
+ * sk_msg (the pending psock->cork message if there is one, else an on-stack
+ * one) and passed to tcp_bpf_send_verdict() until @msg is drained or an
+ * error occurs.
+ *
+ * Returns the number of bytes accounted in 'copied' when non-zero, otherwise
+ * 0 or a negative error that has been passed through sk_stream_error().
+ */
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+       struct sk_msg tmp, *msg_tx = NULL;
+       int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+       int copied = 0, err = 0;
+       struct sk_psock *psock;
+       long timeo;
+
+       psock = sk_psock_get(sk);
+       if (unlikely(!psock))
+               return tcp_sendmsg(sk, msg, size);
+
+       lock_sock(sk);
+       timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+       while (msg_data_left(msg)) {
+               bool enospc = false;
+               u32 copy, osize;
+
+               if (sk->sk_err) {
+                       err = -sk->sk_err;
+                       goto out_err;
+               }
+
+               copy = msg_data_left(msg);
+               if (!sk_stream_memory_free(sk))
+                       goto wait_for_sndbuf;
+               /* Continue filling a corked message if one is pending. */
+               if (psock->cork) {
+                       msg_tx = psock->cork;
+               } else {
+                       msg_tx = &tmp;
+                       sk_msg_init(msg_tx);
+               }
+
+               /* Remember the old size so a failed copy can be trimmed. */
+               osize = msg_tx->sg.size;
+               err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+               if (err) {
+                       if (err != -ENOSPC)
+                               goto wait_for_memory;
+                       /* -ENOSPC: copy only what was actually added. */
+                       enospc = true;
+                       copy = msg_tx->sg.size - osize;
+               }
+
+               err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+                                              copy);
+               if (err < 0) {
+                       sk_msg_trim(sk, msg_tx, osize);
+                       goto out_err;
+               }
+
+               copied += copy;
+               /* Corking in progress: account this call against the
+                * outstanding cork bytes and leave without a verdict while
+                * bytes are still outstanding and the message has room.
+                */
+               if (psock->cork_bytes) {
+                       if (size > psock->cork_bytes)
+                               psock->cork_bytes = 0;
+                       else
+                               psock->cork_bytes -= size;
+                       if (psock->cork_bytes && !enospc)
+                               goto out_err;
+                       /* All cork bytes are accounted, rerun the prog. */
+                       psock->eval = __SK_NONE;
+                       psock->cork_bytes = 0;
+               }
+
+               err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+               if (unlikely(err < 0))
+                       goto out_err;
+               continue;
+wait_for_sndbuf:
+               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err) {
+                       /* Free an on-stack message; a corked one is kept. */
+                       if (msg_tx && msg_tx != psock->cork)
+                               sk_msg_free(sk, msg_tx);
+                       goto out_err;
+               }
+       }
+out_err:
+       if (err < 0)
+               err = sk_stream_error(sk, msg->msg_flags, err);
+       release_sock(sk);
+       sk_psock_put(sk, psock);
+       return copied ? copied : err;
+}
+
+/* sendpage handler installed in the TCP_BPF_TX proto variant.
+ *
+ * Falls back to tcp_sendpage() when sk_psock_get() returns no psock.
+ * Otherwise, under the socket lock, @page is appended to an sk_msg (the
+ * pending psock->cork message if there is one, else an on-stack one),
+ * @size bytes are charged to @sk, and the message is passed to
+ * tcp_bpf_send_verdict().
+ *
+ * Returns 'copied' when non-zero, otherwise err; both are 0 when the
+ * message ring was already full on entry.
+ */
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+                           size_t size, int flags)
+{
+       struct sk_msg tmp, *msg = NULL;
+       int err = 0, copied = 0;
+       struct sk_psock *psock;
+       bool enospc = false;
+
+       psock = sk_psock_get(sk);
+       if (unlikely(!psock))
+               return tcp_sendpage(sk, page, offset, size, flags);
+
+       lock_sock(sk);
+       /* Continue filling a corked message if one is pending. */
+       if (psock->cork) {
+               msg = psock->cork;
+       } else {
+               msg = &tmp;
+               sk_msg_init(msg);
+       }
+
+       /* Catch case where ring is full and sendpage is stalled. */
+       if (unlikely(sk_msg_full(msg)))
+               goto out_err;
+
+       sk_msg_page_add(msg, page, size, offset);
+       sk_mem_charge(sk, size);
+       copied = size;
+       /* Adding this page may have filled the ring. */
+       if (sk_msg_full(msg))
+               enospc = true;
+       /* Corking in progress: account this page against the outstanding
+        * cork bytes and leave without a verdict while bytes are still
+        * outstanding and the ring has room.
+        */
+       if (psock->cork_bytes) {
+               if (size > psock->cork_bytes)
+                       psock->cork_bytes = 0;
+               else
+                       psock->cork_bytes -= size;
+               if (psock->cork_bytes && !enospc)
+                       goto out_err;
+               /* All cork bytes are accounted, rerun the prog. */
+               psock->eval = __SK_NONE;
+               psock->cork_bytes = 0;
+       }
+
+       err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+       release_sock(sk);
+       sk_psock_put(sk, psock);
+       return copied ? copied : err;
+}
+
+/* Tear down the per-psock state of @sk: free the cork buffer, purge queued
+ * ingress messages, then pop every link off @psock, unlinking and freeing
+ * each one in turn.
+ */
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_link *entry;
+
+       sk_psock_cork_free(psock);
+       __sk_psock_purge_ingress_msg(psock);
+
+       for (entry = sk_psock_link_pop(psock); entry;
+            entry = sk_psock_link_pop(psock)) {
+               sk_psock_unlink(sk, entry);
+               sk_psock_free_link(entry);
+       }
+}
+
+/* unhash hook installed in the BPF proto variants.
+ *
+ * With a psock present, its state is torn down via tcp_bpf_remove() inside
+ * the RCU read section and the saved unhash callback is invoked afterwards.
+ * Without a psock, the socket's current proto unhash (if any) is called.
+ */
+static void tcp_bpf_unhash(struct sock *sk)
+{
+       void (*orig_unhash)(struct sock *sk);
+       struct sk_psock *psock;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock)) {
+               /* Fetch the callback before the psock is torn down. */
+               orig_unhash = psock->saved_unhash;
+               tcp_bpf_remove(sk, psock);
+               rcu_read_unlock();
+               orig_unhash(sk);
+               return;
+       }
+       rcu_read_unlock();
+
+       if (sk->sk_prot->unhash)
+               sk->sk_prot->unhash(sk);
+}
+
+/* close hook installed in the BPF proto variants.
+ *
+ * With a psock present, its state is torn down via tcp_bpf_remove() while
+ * holding the socket lock and the RCU read lock; both are released before
+ * the saved close callback runs. Without a psock, the socket's current
+ * proto close is called after the locks are dropped.
+ */
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+       void (*orig_close)(struct sock *sk, long timeout);
+       struct sk_psock *psock;
+
+       lock_sock(sk);
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock)) {
+               /* Fetch the callback before the psock is torn down. */
+               orig_close = psock->saved_close;
+               tcp_bpf_remove(sk, psock);
+               rcu_read_unlock();
+               release_sock(sk);
+               orig_close(sk, timeout);
+               return;
+       }
+       rcu_read_unlock();
+       release_sock(sk);
+
+       sk->sk_prot->close(sk, timeout);
+}
+
+/* Address-family index: first dimension of tcp_bpf_prots[][]. */
+enum {
+       TCP_BPF_IPV4,
+       TCP_BPF_IPV6,
+       TCP_BPF_NUM_PROTS,      /* number of families, used as array bound */
+};
+
+/* Hook-configuration index: second dimension of tcp_bpf_prots[][]. */
+enum {
+       TCP_BPF_BASE,           /* unhash/close/recvmsg/stream_memory_read */
+       TCP_BPF_TX,             /* BASE plus sendmsg/sendpage */
+       TCP_BPF_NUM_CFGS,       /* number of configs, used as array bound */
+};
+
+/* Proto the IPv6 row of tcp_bpf_prots was last built from; written under
+ * tcpv6_prot_lock by tcp_bpf_check_v6_needs_rebuild().
+ */
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+/* BPF-hooked proto copies, indexed by [family][config]. */
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+/* Fill the per-config proto array @prot from @base.
+ *
+ * The TCP_BPF_BASE entry is a copy of @base with the unhash, close, recvmsg
+ * and stream_memory_read hooks replaced. The TCP_BPF_TX entry is a copy of
+ * the BASE entry with sendmsg and sendpage replaced as well.
+ */
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+                                  struct proto *base)
+{
+       struct proto *base_cfg = &prot[TCP_BPF_BASE];
+       struct proto *tx_cfg = &prot[TCP_BPF_TX];
+
+       *base_cfg = *base;
+       base_cfg->unhash = tcp_bpf_unhash;
+       base_cfg->close = tcp_bpf_close;
+       base_cfg->recvmsg = tcp_bpf_recvmsg;
+       base_cfg->stream_memory_read = tcp_bpf_stream_read;
+
+       *tx_cfg = *base_cfg;
+       tx_cfg->sendmsg = tcp_bpf_sendmsg;
+       tx_cfg->sendpage = tcp_bpf_sendpage;
+}
+
+/* Rebuild the IPv6 row of tcp_bpf_prots when @ops differs from the proto
+ * it was last built from.
+ *
+ * Only AF_INET6 sockets are considered. The first comparison is done
+ * without the lock using an acquire load; the comparison is repeated under
+ * tcpv6_prot_lock before rebuilding, and the new value of tcpv6_prot_saved
+ * is published with a release store.
+ */
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+       if (sk->sk_family != AF_INET6)
+               return;
+       if (likely(ops == smp_load_acquire(&tcpv6_prot_saved)))
+               return;
+
+       spin_lock_bh(&tcpv6_prot_lock);
+       if (likely(ops != tcpv6_prot_saved)) {
+               tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+               smp_store_release(&tcpv6_prot_saved, ops);
+       }
+       spin_unlock_bh(&tcpv6_prot_lock);
+}
+
+/* Build the IPv4 row of tcp_bpf_prots from tcp_prot; registered as a
+ * core_initcall. Always returns 0.
+ */
+static int __init tcp_bpf_v4_build_proto(void)
+{
+       struct proto *v4_cfgs = tcp_bpf_prots[TCP_BPF_IPV4];
+
+       tcp_bpf_rebuild_protos(v4_cfgs, &tcp_prot);
+       return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+/* Select the tcp_bpf_prots entry matching @sk's address family and whether
+ * @psock has a msg_parser program, then install it through
+ * sk_psock_update_proto().
+ */
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+       int fam = TCP_BPF_IPV4;
+       int cfg = TCP_BPF_BASE;
+
+       if (sk->sk_family == AF_INET6)
+               fam = TCP_BPF_IPV6;
+       if (psock->progs.msg_parser)
+               cfg = TCP_BPF_TX;
+
+       sk_psock_update_proto(sk, psock, &tcp_bpf_prots[fam][cfg]);
+}
+
+/* Point sk->sk_prot at the tcp_bpf_prots entry matching @sk's address
+ * family and whether @psock has a msg_parser program.
+ */
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+       int fam = TCP_BPF_IPV4;
+       int cfg = TCP_BPF_BASE;
+
+       if (sk->sk_family == AF_INET6)
+               fam = TCP_BPF_IPV6;
+       if (psock->progs.msg_parser)
+               cfg = TCP_BPF_TX;
+
+       /* A reinit happens when the set of attached program types changes
+        * (e.g. TCP_BPF_TX is added or removed), so the sk_prot hooks need
+        * updating. The originally saved hooks are left as they are.
+        */
+       sk->sk_prot = &tcp_bpf_prots[fam][cfg];
+}
+
+/* Check that @ops still uses the stock TCP recvmsg/sendmsg/sendpage.
+ *
+ * Returns 0 when all three match, -ENOTSUPP otherwise.
+ */
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+       /* To avoid retpolines, the handlers call the stock TCP functions
+        * directly when e.g. no psock is present. Verify that this
+        * assumption actually holds for @ops.
+        */
+       if (ops->recvmsg != tcp_recvmsg)
+               return -ENOTSUPP;
+       if (ops->sendmsg != tcp_sendmsg)
+               return -ENOTSUPP;
+       if (ops->sendpage != tcp_sendpage)
+               return -ENOTSUPP;
+
+       return 0;
+}
+
+/* Re-select the BPF proto variant for @sk from its current psock.
+ *
+ * The psock is looked up and used inside an RCU read section;
+ * sock_owned_by_me() is invoked on @sk first.
+ */
+void tcp_bpf_reinit(struct sock *sk)
+{
+       sock_owned_by_me(sk);
+
+       rcu_read_lock();
+       tcp_bpf_reinit_sk_prot(sk, sk_psock(sk));
+       rcu_read_unlock();
+}
+
+/* Install the BPF proto variant on @sk.
+ *
+ * Fails with -EINVAL when @sk has no psock, when the psock already has
+ * sk_proto set, or when the socket's current proto does not use the stock
+ * TCP handlers (see tcp_bpf_assert_proto_ops()). On success the IPv6 proto
+ * table is rebuilt if needed, the matching variant is installed, and 0 is
+ * returned.
+ */
+int tcp_bpf_init(struct sock *sk)
+{
+       struct proto *ops = READ_ONCE(sk->sk_prot);
+       struct sk_psock *psock;
+       int err = -EINVAL;
+
+       sock_owned_by_me(sk);
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (unlikely(!psock || psock->sk_proto ||
+                    tcp_bpf_assert_proto_ops(ops)))
+               goto out;
+
+       tcp_bpf_check_v6_needs_rebuild(sk, ops);
+       tcp_bpf_update_sk_prot(sk, psock);
+       err = 0;
+out:
+       rcu_read_unlock();
+       return err;
+}
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig

index 6cff3f6..94da19a 100644 (file)
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
  config STREAM_PARSER
-       tristate
-       default n
+       def_bool n
author	Daniel Borkmann <daniel@iogearbox.net>
	Sat, 13 Oct 2018 00:45:58 +0000 (02:45 +0200)
committer	Alexei Starovoitov <ast@kernel.org>
	Mon, 15 Oct 2018 19:23:19 +0000 (12:23 -0700)
include/linux/bpf.h		patch \| blob \| history
include/linux/bpf_types.h		patch \| blob \| history
include/linux/filter.h		patch \| blob \| history
include/linux/skmsg.h	[new file with mode: 0644]	patch \| blob
include/net/tcp.h		patch \| blob \| history
kernel/bpf/Makefile		patch \| blob \| history
kernel/bpf/core.c		patch \| blob \| history
kernel/bpf/sockmap.c	[deleted file]	patch \| blob \| history
kernel/bpf/syscall.c		patch \| blob \| history
net/Kconfig		patch \| blob \| history
net/core/Makefile		patch \| blob \| history
net/core/filter.c		patch \| blob \| history
net/core/skmsg.c	[new file with mode: 0644]	patch \| blob
net/core/sock_map.c	[new file with mode: 0644]	patch \| blob
net/ipv4/Makefile		patch \| blob \| history
net/ipv4/tcp_bpf.c	[new file with mode: 0644]	patch \| blob
net/strparser/Kconfig		patch \| blob \| history