Merge branch 'uaccess.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41ba746..9693730 100644
 #include <linux/nospec.h>
 #include <linux/audit.h>
 #include <uapi/linux/btf.h>
+#include <linux/pgtable.h>
 #include <linux/bpf_lsm.h>
+#include <linux/poll.h>
+#include <linux/bpf-netns.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
                          (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -42,6 +45,8 @@ static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
 static DEFINE_SPINLOCK(map_idr_lock);
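+/* bpf_link IDs, analogous to the prog and map IDs above; they let
+ * user-space enumerate and fetch links via BPF_LINK_GET_NEXT_ID and
+ * BPF_LINK_GET_FD_BY_ID
+ */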
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -49,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
 #define BPF_MAP_TYPE(_id, _ops) \
        [_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 };
 
 /*
@@ -268,27 +275,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
         * __GFP_RETRY_MAYFAIL to avoid such situations.
         */
 
-       const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+       const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+       unsigned int flags = 0;
+       unsigned long align = 1;
        void *area;
 
        if (size >= SIZE_MAX)
                return NULL;
 
        /* kmalloc()'ed memory can't be mmap()'ed */
-       if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-               area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
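+       /* mmap()-able maps have to come from vmalloc: VM_USERMAP marks the
+        * area as remappable into user-space, and SHMLBA alignment avoids
+        * cache aliasing on architectures with virtually indexed caches
+        */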
+       if (mmapable) {
+               BUG_ON(!PAGE_ALIGNED(size));
+               align = SHMLBA;
+               flags = VM_USERMAP;
+       } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+               area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
                                    numa_node);
                if (area != NULL)
                        return area;
        }
-       if (mmapable) {
-               BUG_ON(!PAGE_ALIGNED(size));
-               return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
-                                              __GFP_RETRY_MAYFAIL | flags);
-       }
-       return __vmalloc_node_flags_caller(size, numa_node,
-                                          GFP_KERNEL | __GFP_RETRY_MAYFAIL |
-                                          flags, __builtin_return_address(0));
+
+       return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+                       gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+                       flags, numa_node, __builtin_return_address(0));
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node)
@@ -573,9 +582,7 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
        struct bpf_map *map = vma->vm_file->private_data;
 
-       bpf_map_inc_with_uref(map);
-
-       if (vma->vm_flags & VM_WRITE) {
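+       /* track VM_MAYWRITE instead of VM_WRITE: mprotect() may toggle
+        * VM_WRITE later, but VM_MAYWRITE is fixed at mmap() time, so
+        * writecnt stays balanced across open/close
+        */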
+       if (vma->vm_flags & VM_MAYWRITE) {
                mutex_lock(&map->freeze_mutex);
                map->writecnt++;
                mutex_unlock(&map->freeze_mutex);
@@ -587,13 +594,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
        struct bpf_map *map = vma->vm_file->private_data;
 
-       if (vma->vm_flags & VM_WRITE) {
+       if (vma->vm_flags & VM_MAYWRITE) {
                mutex_lock(&map->freeze_mutex);
                map->writecnt--;
                mutex_unlock(&map->freeze_mutex);
        }
-
-       bpf_map_put_with_uref(map);
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -614,28 +619,51 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 
        mutex_lock(&map->freeze_mutex);
 
-       if ((vma->vm_flags & VM_WRITE) && map->frozen) {
-               err = -EPERM;
-               goto out;
+       if (vma->vm_flags & VM_WRITE) {
+               if (map->frozen) {
+                       err = -EPERM;
+                       goto out;
+               }
+               /* map is meant to be read-only, so do not allow mapping it
+                * as writable: that could leak a writable page reference,
+                * letting user-space modify contents after freezing, while
+                * the verifier assumes they do not change
+                */
+               if (map->map_flags & BPF_F_RDONLY_PROG) {
+                       err = -EACCES;
+                       goto out;
+               }
        }
 
        /* set default open/close callbacks */
        vma->vm_ops = &bpf_map_default_vmops;
        vma->vm_private_data = map;
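+       /* map memory must never be mapped executable */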
+       vma->vm_flags &= ~VM_MAYEXEC;
+       if (!(vma->vm_flags & VM_WRITE))
+               /* disallow re-mapping with PROT_WRITE */
+               vma->vm_flags &= ~VM_MAYWRITE;
 
        err = map->ops->map_mmap(map, vma);
        if (err)
                goto out;
 
-       bpf_map_inc_with_uref(map);
-
-       if (vma->vm_flags & VM_WRITE)
+       if (vma->vm_flags & VM_MAYWRITE)
                map->writecnt++;
 out:
        mutex_unlock(&map->freeze_mutex);
        return err;
 }
 
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+       struct bpf_map *map = filp->private_data;
+
+       if (map->ops->map_poll)
+               return map->ops->map_poll(map, filp, pts);
+
+       return EPOLLERR;
+}
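+
+/* A map type opts into polling by providing ->map_poll in its bpf_map_ops.
+ * Rough sketch of such a callback (hypothetical my_map type with a wait
+ * queue; not part of this patch):
+ *
+ *	static __poll_t my_map_poll(struct bpf_map *map, struct file *filp,
+ *				    struct poll_table_struct *pts)
+ *	{
+ *		struct my_map *m = container_of(map, struct my_map, map);
+ *
+ *		poll_wait(filp, &m->waitq, pts);
+ *		return my_map_has_data(m) ? EPOLLIN | EPOLLRDNORM : 0;
+ *	}
+ */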
+
 const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_map_show_fdinfo,
@@ -644,6 +672,7 @@ const struct file_operations bpf_map_fops = {
        .read           = bpf_dummy_read,
        .write          = bpf_dummy_write,
        .mmap           = bpf_map_mmap,
+       .poll           = bpf_map_poll,
 };
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1361,7 +1390,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
 
        buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
        if (!buf) {
-               kvfree(buf_prevkey);
+               kfree(buf_prevkey);
                return -ENOMEM;
        }
 
@@ -1446,7 +1475,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
-       if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+       if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+           !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto err_put;
        }
@@ -1474,8 +1504,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        if (err)
                goto free_value;
 
-       if (copy_to_user(uvalue, value, value_size) != 0)
+       if (copy_to_user(uvalue, value, value_size) != 0) {
+               err = -EFAULT;
                goto free_value;
+       }
 
        err = 0;
 
@@ -1519,7 +1551,7 @@ static int map_freeze(const union bpf_attr *attr)
                err = -EBUSY;
                goto err_put;
        }
-       if (!capable(CAP_SYS_ADMIN)) {
+       if (!bpf_capable()) {
                err = -EPERM;
                goto err_put;
        }
@@ -1535,9 +1567,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        [_id] = & _name ## _prog_ops,
 #define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 };
 
 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -1959,6 +1993,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
@@ -1992,6 +2030,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
        }
 }
 
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+       switch (prog_type) {
+       case BPF_PROG_TYPE_SCHED_CLS:
+       case BPF_PROG_TYPE_SCHED_ACT:
+       case BPF_PROG_TYPE_XDP:
+       case BPF_PROG_TYPE_LWT_IN:
+       case BPF_PROG_TYPE_LWT_OUT:
+       case BPF_PROG_TYPE_LWT_XMIT:
+       case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+       case BPF_PROG_TYPE_SK_SKB:
+       case BPF_PROG_TYPE_SK_MSG:
+       case BPF_PROG_TYPE_LIRC_MODE2:
+       case BPF_PROG_TYPE_FLOW_DISSECTOR:
+       case BPF_PROG_TYPE_CGROUP_DEVICE:
+       case BPF_PROG_TYPE_CGROUP_SOCK:
+       case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+       case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+       case BPF_PROG_TYPE_CGROUP_SYSCTL:
+       case BPF_PROG_TYPE_SOCK_OPS:
+       case BPF_PROG_TYPE_EXT: /* extends any prog */
+               return true;
+       case BPF_PROG_TYPE_CGROUP_SKB:
+               /* always unpriv */
+       case BPF_PROG_TYPE_SK_REUSEPORT:
+               /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+       default:
+               return false;
+       }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+       switch (prog_type) {
+       case BPF_PROG_TYPE_KPROBE:
+       case BPF_PROG_TYPE_TRACEPOINT:
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+       case BPF_PROG_TYPE_TRACING:
+       case BPF_PROG_TYPE_LSM:
+       case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+       case BPF_PROG_TYPE_EXT: /* extends any prog */
+               return true;
+       default:
+               return false;
+       }
+}
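+
+/* Together the two predicates above implement the capability split for
+ * program loading: networking types additionally require CAP_NET_ADMIN,
+ * tracing types require CAP_PERFMON, and everything else loads with just
+ * bpf_capable(). For reference, the helpers introduced by the CAP_BPF
+ * series are roughly:
+ *
+ *	static inline bool bpf_capable(void)
+ *	{
+ *		return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
+ *	}
+ *
+ *	static inline bool perfmon_capable(void)
+ *	{
+ *		return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
+ *	}
+ */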
+
 /* last field in 'union bpf_attr' used by this command */
 #define        BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
 
@@ -2014,7 +2101,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-           !capable(CAP_SYS_ADMIN))
+           !bpf_capable())
                return -EPERM;
 
        /* copy eBPF program license from user space */
@@ -2027,11 +2114,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
        is_gpl = license_is_gpl_compatible(license);
 
        if (attr->insn_cnt == 0 ||
-           attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+           attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
                return -E2BIG;
        if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
            type != BPF_PROG_TYPE_CGROUP_SKB &&
-           !capable(CAP_SYS_ADMIN))
+           !bpf_capable())
+               return -EPERM;
+
+       if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN))
+               return -EPERM;
+       if (is_perfmon_prog_type(type) && !perfmon_capable())
                return -EPERM;
 
        bpf_prog_load_fixup_attach_type(attr);
@@ -2170,25 +2262,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
                                attr->file_flags);
 }
 
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
-                  struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+                  const struct bpf_link_ops *ops, struct bpf_prog *prog)
 {
        atomic64_set(&link->refcnt, 1);
+       link->type = type;
+       link->id = 0;
        link->ops = ops;
        link->prog = prog;
 }
 
+static void bpf_link_free_id(int id)
+{
+       if (!id)
+               return;
+
+       spin_lock_bh(&link_idr_lock);
+       idr_remove(&link_idr, id);
+       spin_unlock_bh(&link_idr_lock);
+}
+
 /* Clean up bpf_link and corresponding anon_inode file and FD. After
  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marks bpf_link as
+ * defunct, releases the anon_inode file and puts the reserved FD. bpf_prog's
+ * refcnt is not decremented; that is the responsibility of the caller that
+ * failed to complete bpf_link initialization.
  */
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
-                     int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
 {
-       link->prog = NULL;
-       fput(link_file);
-       put_unused_fd(link_fd);
+       primer->link->prog = NULL;
+       bpf_link_free_id(primer->id);
+       fput(primer->file);
+       put_unused_fd(primer->fd);
 }
 
 void bpf_link_inc(struct bpf_link *link)
@@ -2199,6 +2305,7 @@ void bpf_link_inc(struct bpf_link *link)
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+       bpf_link_free_id(link->id);
        if (link->prog) {
                /* detach BPF program, clean up used resources */
                link->ops->release(link);
@@ -2240,39 +2347,39 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
 }
 
 #ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+       [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 
 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 {
        const struct bpf_link *link = filp->private_data;
        const struct bpf_prog *prog = link->prog;
        char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
-       const char *link_type;
-
-       if (link->ops == &bpf_raw_tp_lops)
-               link_type = "raw_tracepoint";
-       else if (link->ops == &bpf_tracing_link_lops)
-               link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
-       else if (link->ops == &bpf_cgroup_link_lops)
-               link_type = "cgroup";
-#endif
-       else
-               link_type = "unknown";
 
        bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
        seq_printf(m,
                   "link_type:\t%s\n"
+                  "link_id:\t%u\n"
                   "prog_tag:\t%s\n"
                   "prog_id:\t%u\n",
-                  link_type,
+                  bpf_link_type_strs[link->type],
+                  link->id,
                   prog_tag,
                   prog->aux->id);
+       if (link->ops->show_fdinfo)
+               link->ops->show_fdinfo(link, m);
 }
 #endif
 
-const struct file_operations bpf_link_fops = {
+static const struct file_operations bpf_link_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_link_show_fdinfo,
 #endif
@@ -2281,36 +2388,77 @@ const struct file_operations bpf_link_fops = {
        .write          = bpf_dummy_write,
 };
 
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
 {
-       return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+       int id;
 
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+       idr_preload(GFP_KERNEL);
+       spin_lock_bh(&link_idr_lock);
+       id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+       spin_unlock_bh(&link_idr_lock);
+       idr_preload_end();
+
+       return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with a bpf_link_cleanup() call.
+ * All the transient state is passed around in struct bpf_link_primer.
+ * This is the preferred way to create and initialize bpf_link, especially
+ * when there are complicated and expensive operations in between creating
+ * bpf_link itself and attaching it to a BPF hook. By using bpf_link_prime()
+ * and bpf_link_settle(), kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) rollback operations in the rare case
+ * that file, FD, or ID can't be allocated.
  */
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
 {
        struct file *file;
-       int fd;
+       int fd, id;
 
        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
-               return ERR_PTR(fd);
+               return fd;
+
+       id = bpf_link_alloc_id(link);
+       if (id < 0) {
+               put_unused_fd(fd);
+               return id;
+       }
 
        file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
        if (IS_ERR(file)) {
+               bpf_link_free_id(id);
                put_unused_fd(fd);
-               return file;
+               return PTR_ERR(file);
        }
 
-       *reserved_fd = fd;
-       return file;
+       primer->link = link;
+       primer->file = file;
+       primer->fd = fd;
+       primer->id = id;
+       return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+       /* make bpf_link fetchable by ID */
+       spin_lock_bh(&link_idr_lock);
+       primer->link->id = primer->id;
+       spin_unlock_bh(&link_idr_lock);
+       /* make bpf_link fetchable by FD */
+       fd_install(primer->fd, primer->file);
+       /* pass through installed FD */
+       return primer->fd;
+}
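+
+/* Condensed usage pattern for the prime/settle/cleanup trio, mirroring
+ * bpf_tracing_prog_attach() and bpf_raw_tracepoint_open() below
+ * (BPF_LINK_TYPE_FOO, foo_link_lops and attach_to_hook() are placeholders).
+ * On prime failure the link isn't visible anywhere yet and can simply be
+ * kfree'd; on attach failure bpf_link_cleanup() undoes file/FD/ID but
+ * leaves the prog reference for the caller to drop:
+ *
+ *	bpf_link_init(&link->link, BPF_LINK_TYPE_FOO, &foo_link_lops, prog);
+ *	err = bpf_link_prime(&link->link, &primer);
+ *	if (err) {
+ *		kfree(link);
+ *		goto out_put_prog;
+ *	}
+ *	err = attach_to_hook(link);
+ *	if (err) {
+ *		bpf_link_cleanup(&primer);
+ *		goto out_put_prog;
+ *	}
+ *	return bpf_link_settle(&primer);
+ */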
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+       return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
 }
 
 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2334,6 +2482,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 
 struct bpf_tracing_link {
        struct bpf_link link;
+       enum bpf_attach_type attach_type;
 };
 
 static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2349,16 +2498,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
        kfree(tr_link);
 }
 
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+                                        struct seq_file *seq)
+{
+       struct bpf_tracing_link *tr_link =
+               container_of(link, struct bpf_tracing_link, link);
+
+       seq_printf(seq,
+                  "attach_type:\t%d\n",
+                  tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+                                          struct bpf_link_info *info)
+{
+       struct bpf_tracing_link *tr_link =
+               container_of(link, struct bpf_tracing_link, link);
+
+       info->tracing.attach_type = tr_link->attach_type;
+
+       return 0;
+}
+
 static const struct bpf_link_ops bpf_tracing_link_lops = {
        .release = bpf_tracing_link_release,
        .dealloc = bpf_tracing_link_dealloc,
+       .show_fdinfo = bpf_tracing_link_show_fdinfo,
+       .fill_link_info = bpf_tracing_link_fill_link_info,
 };
 
 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 {
+       struct bpf_link_primer link_primer;
        struct bpf_tracing_link *link;
-       struct file *link_file;
-       int link_fd, err;
+       int err;
 
        switch (prog->type) {
        case BPF_PROG_TYPE_TRACING:
@@ -2391,24 +2564,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
                err = -ENOMEM;
                goto out_put_prog;
        }
-       bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+       bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+                     &bpf_tracing_link_lops, prog);
+       link->attach_type = prog->expected_attach_type;
 
-       link_file = bpf_link_new_file(&link->link, &link_fd);
-       if (IS_ERR(link_file)) {
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err) {
                kfree(link);
-               err = PTR_ERR(link_file);
                goto out_put_prog;
        }
 
        err = bpf_trampoline_link_prog(prog);
        if (err) {
-               bpf_link_cleanup(&link->link, link_file, link_fd);
+               bpf_link_cleanup(&link_primer);
                goto out_put_prog;
        }
 
-       fd_install(link_fd, link_file);
-       return link_fd;
-
+       return bpf_link_settle(&link_primer);
 out_put_prog:
        bpf_prog_put(prog);
        return err;
@@ -2436,22 +2608,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
        kfree(raw_tp);
 }
 
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+                                       struct seq_file *seq)
+{
+       struct bpf_raw_tp_link *raw_tp_link =
+               container_of(link, struct bpf_raw_tp_link, link);
+
+       seq_printf(seq,
+                  "tp_name:\t%s\n",
+                  raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+                                         struct bpf_link_info *info)
+{
+       struct bpf_raw_tp_link *raw_tp_link =
+               container_of(link, struct bpf_raw_tp_link, link);
+       char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+       const char *tp_name = raw_tp_link->btp->tp->name;
+       u32 ulen = info->raw_tracepoint.tp_name_len;
+       size_t tp_len = strlen(tp_name);
+
+       /* tp_name buffer and its length must be provided together,
+        * otherwise ulen - 1 below would underflow for a zero-length buffer
+        */
+       if (!ulen ^ !ubuf)
+               return -EINVAL;
+
+       info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+       if (!ubuf)
+               return 0;
+
+       if (ulen >= tp_len + 1) {
+               if (copy_to_user(ubuf, tp_name, tp_len + 1))
+                       return -EFAULT;
+       } else {
+               char zero = '\0';
+
+               if (copy_to_user(ubuf, tp_name, ulen - 1))
+                       return -EFAULT;
+               if (put_user(zero, ubuf + ulen - 1))
+                       return -EFAULT;
+               return -ENOSPC;
+       }
+
+       return 0;
+}
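+
+/* The tp_name round-trip above lets user-space size its buffer in two
+ * steps: query with tp_name_len == 0 to learn the required length, then
+ * retry. Minimal user-space sketch (direct syscall, no libbpf):
+ *
+ *	struct bpf_link_info info = {};
+ *	union bpf_attr attr = {};
+ *	char name[128];
+ *
+ *	info.raw_tracepoint.tp_name = (__u64)(uintptr_t)name;
+ *	info.raw_tracepoint.tp_name_len = sizeof(name);
+ *	attr.info.bpf_fd = link_fd;
+ *	attr.info.info = (__u64)(uintptr_t)&info;
+ *	attr.info.info_len = sizeof(info);
+ *	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+ *
+ * A -ENOSPC return means name holds a truncated, NUL-terminated prefix.
+ */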
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
        .release = bpf_raw_tp_link_release,
        .dealloc = bpf_raw_tp_link_dealloc,
+       .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+       .fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
 
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 {
+       struct bpf_link_primer link_primer;
        struct bpf_raw_tp_link *link;
        struct bpf_raw_event_map *btp;
-       struct file *link_file;
        struct bpf_prog *prog;
        const char *tp_name;
        char buf[128];
-       int link_fd, err;
+       int err;
 
        if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
                return -EINVAL;
@@ -2504,24 +2723,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
                err = -ENOMEM;
                goto out_put_btp;
        }
-       bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+       bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+                     &bpf_raw_tp_link_lops, prog);
        link->btp = btp;
 
-       link_file = bpf_link_new_file(&link->link, &link_fd);
-       if (IS_ERR(link_file)) {
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err) {
                kfree(link);
-               err = PTR_ERR(link_file);
                goto out_put_btp;
        }
 
        err = bpf_probe_register(link->btp, prog);
        if (err) {
-               bpf_link_cleanup(&link->link, link_file, link_fd);
+               bpf_link_cleanup(&link_primer);
                goto out_put_btp;
        }
 
-       fd_install(link_fd, link_file);
-       return link_fd;
+       return bpf_link_settle(&link_primer);
 
 out_put_btp:
        bpf_put_raw_tracepoint(btp);
@@ -2539,6 +2757,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
        case BPF_PROG_TYPE_CGROUP_SKB:
+               if (!capable(CAP_NET_ADMIN))
+                       /* cg-skb progs can be loaded by an unprivileged
+                        * user; check permissions at attach time
+                        */
+                       return -EPERM;
                return prog->enforce_expected_attach_type &&
                        prog->expected_attach_type != attach_type ?
                        -EINVAL : 0;
@@ -2563,6 +2786,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2586,6 +2813,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_GETSOCKOPT:
        case BPF_CGROUP_SETSOCKOPT:
                return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+       case BPF_TRACE_ITER:
+               return BPF_PROG_TYPE_TRACING;
        default:
                return BPF_PROG_TYPE_UNSPEC;
        }
@@ -2602,9 +2831,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
        struct bpf_prog *prog;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_PROG_ATTACH))
                return -EINVAL;
 
@@ -2633,7 +2859,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
                ret = lirc_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
-               ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+               ret = netns_bpf_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
@@ -2659,9 +2885,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 {
        enum bpf_prog_type ptype;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_PROG_DETACH))
                return -EINVAL;
 
@@ -2674,7 +2897,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
        case BPF_PROG_TYPE_LIRC_MODE2:
                return lirc_prog_detach(attr);
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
-               return skb_flow_dissector_bpf_prog_detach(attr);
+               if (!capable(CAP_NET_ADMIN))
+                       return -EPERM;
+               return netns_bpf_prog_detach(attr);
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2710,6 +2935,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_CGROUP_INET6_POST_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2723,7 +2952,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_LIRC_MODE2:
                return lirc_prog_query(attr, uattr);
        case BPF_FLOW_DISSECTOR:
-               return skb_flow_dissector_prog_query(attr, uattr);
+               return netns_bpf_prog_query(attr, uattr);
        default:
                return -EINVAL;
        }
@@ -2737,8 +2966,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
        struct bpf_prog *prog;
        int ret = -ENOTSUPP;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
        if (CHECK_ATTR(BPF_PROG_TEST_RUN))
                return -EINVAL;
 
@@ -2789,6 +3016,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
        return err;
 }
 
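+/* Find the map with the smallest id >= *id and take a reference on it;
+ * maps whose refcount already dropped to zero are skipped. Used by the
+ * BPF map iterator.
+ */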
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+       struct bpf_map *map;
+
+       spin_lock_bh(&map_idr_lock);
+again:
+       map = idr_get_next(&map_idr, id);
+       if (map) {
+               map = __bpf_map_inc_not_zero(map, false);
+               if (IS_ERR(map)) {
+                       (*id)++;
+                       goto again;
+               }
+       }
+       spin_unlock_bh(&map_idr_lock);
+
+       return map;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3020,7 +3266,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
        info.run_time_ns = stats.nsecs;
        info.run_cnt = stats.cnt;
 
-       if (!capable(CAP_SYS_ADMIN)) {
+       if (!bpf_capable()) {
                info.jited_prog_len = 0;
                info.xlated_prog_len = 0;
                info.nr_jited_ksyms = 0;
@@ -3302,6 +3548,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
        return btf_get_info_by_fd(btf, attr, uattr);
 }
 
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+                                 const union bpf_attr *attr,
+                                 union bpf_attr __user *uattr)
+{
+       struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+       struct bpf_link_info info;
+       u32 info_len = attr->info.info_len;
+       int err;
+
+       err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+       if (err)
+               return err;
+       info_len = min_t(u32, sizeof(info), info_len);
+
+       memset(&info, 0, sizeof(info));
+       if (copy_from_user(&info, uinfo, info_len))
+               return -EFAULT;
+
+       info.type = link->type;
+       info.id = link->id;
+       info.prog_id = link->prog->aux->id;
+
+       if (link->ops->fill_link_info) {
+               err = link->ops->fill_link_info(link, &info);
+               if (err)
+                       return err;
+       }
+
+       if (copy_to_user(uinfo, &info, info_len) ||
+           put_user(info_len, &uattr->info.info_len))
+               return -EFAULT;
+
+       return 0;
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3326,6 +3608,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
                                             uattr);
        else if (f.file->f_op == &btf_fops)
                err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+       else if (f.file->f_op == &bpf_link_fops)
+               err = bpf_link_get_info_by_fd(f.file->private_data,
+                                             attr, uattr);
        else
                err = -EINVAL;
 
@@ -3340,7 +3625,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
        if (CHECK_ATTR(BPF_BTF_LOAD))
                return -EINVAL;
 
-       if (!capable(CAP_SYS_ADMIN))
+       if (!bpf_capable())
                return -EPERM;
 
        return btf_new_fd(attr);
@@ -3453,7 +3738,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
        if (file->f_op == &bpf_link_fops) {
                struct bpf_link *link = file->private_data;
 
-               if (link->ops == &bpf_raw_tp_lops) {
+               if (link->ops == &bpf_raw_tp_link_lops) {
                        struct bpf_raw_tp_link *raw_tp =
                                container_of(link, struct bpf_raw_tp_link, link);
                        struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3547,6 +3832,15 @@ err_put:
        return err;
 }
 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+       if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+           prog->expected_attach_type == BPF_TRACE_ITER)
+               return bpf_iter_link_attach(attr, prog);
+
+       return -EINVAL;
+}
+
 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
 static int link_create(union bpf_attr *attr)
 {
@@ -3554,9 +3848,6 @@ static int link_create(union bpf_attr *attr)
        struct bpf_prog *prog;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_LINK_CREATE))
                return -EINVAL;
 
@@ -3583,6 +3874,12 @@ static int link_create(union bpf_attr *attr)
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                ret = cgroup_bpf_link_attach(attr, prog);
                break;
+       case BPF_PROG_TYPE_TRACING:
+               ret = tracing_bpf_link_attach(attr, prog);
+               break;
+       case BPF_PROG_TYPE_FLOW_DISSECTOR:
+               ret = netns_bpf_link_create(attr, prog);
+               break;
        default:
                ret = -EINVAL;
        }
@@ -3602,9 +3899,6 @@ static int link_update(union bpf_attr *attr)
        u32 flags;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_LINK_UPDATE))
                return -EINVAL;
 
@@ -3617,8 +3911,10 @@ static int link_update(union bpf_attr *attr)
                return PTR_ERR(link);
 
        new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
-       if (IS_ERR(new_prog))
-               return PTR_ERR(new_prog);
+       if (IS_ERR(new_prog)) {
+               ret = PTR_ERR(new_prog);
+               goto out_put_link;
+       }
 
        if (flags & BPF_F_REPLACE) {
                old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
@@ -3627,30 +3923,151 @@ static int link_update(union bpf_attr *attr)
                        old_prog = NULL;
                        goto out_put_progs;
                }
-       }
-
-#ifdef CONFIG_CGROUP_BPF
-       if (link->ops == &bpf_cgroup_link_lops) {
-               ret = cgroup_bpf_replace(link, old_prog, new_prog);
+       } else if (attr->link_update.old_prog_fd) {
+               ret = -EINVAL;
                goto out_put_progs;
        }
-#endif
-       ret = -EINVAL;
+
+       if (link->ops->update_prog)
+               ret = link->ops->update_prog(link, new_prog, old_prog);
+       else
+               ret = -EINVAL;
 
 out_put_progs:
        if (old_prog)
                bpf_prog_put(old_prog);
        if (ret)
                bpf_prog_put(new_prog);
+out_put_link:
+       bpf_link_put(link);
        return ret;
 }
 
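+/* Take a reference only if refcnt isn't already zero; called under
+ * link_idr_lock, so a link that is concurrently being freed is never
+ * resurrected.
+ */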
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+       return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+       struct bpf_link *link;
+       u32 id = attr->link_id;
+       int fd, err;
+
+       if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       spin_lock_bh(&link_idr_lock);
+       link = idr_find(&link_idr, id);
+       /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+       if (link) {
+               if (link->id)
+                       err = bpf_link_inc_not_zero(link);
+               else
+                       err = -EAGAIN;
+       } else {
+               err = -ENOENT;
+       }
+       spin_unlock_bh(&link_idr_lock);
+
+       if (err)
+               return err;
+
+       fd = bpf_link_new_fd(link);
+       if (fd < 0)
+               bpf_link_put(link);
+
+       return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+       mutex_lock(&bpf_stats_enabled_mutex);
+       static_key_slow_dec(&bpf_stats_enabled_key.key);
+       mutex_unlock(&bpf_stats_enabled_mutex);
+       return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+       .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+       int fd;
+
+       mutex_lock(&bpf_stats_enabled_mutex);
+
+       /* Set a very high limit to avoid overflow */
+       if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+               mutex_unlock(&bpf_stats_enabled_mutex);
+               return -EBUSY;
+       }
+
+       fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+       if (fd >= 0)
+               static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+       mutex_unlock(&bpf_stats_enabled_mutex);
+       return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+       if (CHECK_ATTR(BPF_ENABLE_STATS))
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (attr->enable_stats.type) {
+       case BPF_STATS_RUN_TIME:
+               return bpf_enable_runtime_stats();
+       default:
+               break;
+       }
+       return -EINVAL;
+}
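+
+/* The returned FD acts as a reference on the stats static key: run-time
+ * stats stay enabled until every such FD is closed. User-space sketch:
+ *
+ *	union bpf_attr attr = {};
+ *
+ *	attr.enable_stats.type = BPF_STATS_RUN_TIME;
+ *	fd = syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
+ *
+ * Per-program run_time_ns/run_cnt then accumulate (visible via
+ * BPF_OBJ_GET_INFO_BY_FD) until close(fd).
+ */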
+
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+       struct bpf_link *link;
+       int err;
+
+       if (CHECK_ATTR(BPF_ITER_CREATE))
+               return -EINVAL;
+
+       if (attr->iter_create.flags)
+               return -EINVAL;
+
+       link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+       if (IS_ERR(link))
+               return PTR_ERR(link);
+
+       err = bpf_iter_new_fd(link);
+       bpf_link_put(link);
+
+       return err;
+}
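+
+/* BPF_ITER_CREATE turns an attached iterator link into a readable FD;
+ * user-space then read()s until EOF while the kernel runs the iterator
+ * program per element. Sketch, assuming link_fd holds a BPF_TRACE_ITER
+ * link:
+ *
+ *	attr.iter_create.link_fd = link_fd;
+ *	iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
+ *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
+ *		fwrite(buf, 1, n, stdout);
+ *	close(iter_fd);
+ */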
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
        union bpf_attr attr;
        int err;
 
-       if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+       if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
                return -EPERM;
 
        err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -3762,6 +4179,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
        case BPF_LINK_UPDATE:
                err = link_update(&attr);
                break;
+       case BPF_LINK_GET_FD_BY_ID:
+               err = bpf_link_get_fd_by_id(&attr);
+               break;
+       case BPF_LINK_GET_NEXT_ID:
+               err = bpf_obj_get_next_id(&attr, uattr,
+                                         &link_idr, &link_idr_lock);
+               break;
+       case BPF_ENABLE_STATS:
+               err = bpf_enable_stats(&attr);
+               break;
+       case BPF_ITER_CREATE:
+               err = bpf_iter_create(&attr);
+               break;
        default:
                err = -EINVAL;
                break;