Merge branch 'uaccess.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41ba746..9693730 100644
 #include <linux/nospec.h>
 #include <linux/audit.h>
 #include <uapi/linux/btf.h>
+#include <linux/pgtable.h>
 #include <linux/bpf_lsm.h>
+#include <linux/poll.h>
+#include <linux/bpf-netns.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
                          (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -42,6 +45,8 @@ static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
 static DEFINE_SPINLOCK(map_idr_lock);
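+/* bpf_link IDs, analogous to the prog and map IDs above; they let
+ * user-space enumerate and fetch links via BPF_LINK_GET_NEXT_ID and
+ * BPF_LINK_GET_FD_BY_ID
+ */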
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -49,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
 #define BPF_MAP_TYPE(_id, _ops) \
        [_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 };
 
 /*
@@ -268,27 +275,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
         * __GFP_RETRY_MAYFAIL to avoid such situations.
         */
 
-       const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+       const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+       unsigned int flags = 0;
+       unsigned long align = 1;
        void *area;
 
        if (size >= SIZE_MAX)
                return NULL;
 
        /* kmalloc()'ed memory can't be mmap()'ed */
-       if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-               area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
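+       /* mmap()-able maps have to come from vmalloc: VM_USERMAP marks the
+        * area as remappable into user-space, and SHMLBA alignment avoids
+        * cache aliasing on architectures with virtually indexed caches
+        */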
+       if (mmapable) {
+               BUG_ON(!PAGE_ALIGNED(size));
+               align = SHMLBA;
+               flags = VM_USERMAP;
+       } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+               area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
                                    numa_node);
                if (area != NULL)
                        return area;
        }
-       if (mmapable) {
-               BUG_ON(!PAGE_ALIGNED(size));
-               return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
-                                              __GFP_RETRY_MAYFAIL | flags);
-       }
-       return __vmalloc_node_flags_caller(size, numa_node,
-                                          GFP_KERNEL | __GFP_RETRY_MAYFAIL |
-                                          flags, __builtin_return_address(0));
+
+       return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+                       gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+                       flags, numa_node, __builtin_return_address(0));
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node)
@@ -573,9 +582,7 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
        struct bpf_map *map = vma->vm_file->private_data;
 
-       bpf_map_inc_with_uref(map);
-
-       if (vma->vm_flags & VM_WRITE) {
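+       /* track VM_MAYWRITE instead of VM_WRITE: mprotect() may toggle
+        * VM_WRITE later, but VM_MAYWRITE is fixed at mmap() time, so
+        * writecnt stays balanced across open/close
+        */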
+       if (vma->vm_flags & VM_MAYWRITE) {
                mutex_lock(&map->freeze_mutex);
                map->writecnt++;
                mutex_unlock(&map->freeze_mutex);
@@ -587,13 +594,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
        struct bpf_map *map = vma->vm_file->private_data;
 
-       if (vma->vm_flags & VM_WRITE) {
+       if (vma->vm_flags & VM_MAYWRITE) {
                mutex_lock(&map->freeze_mutex);
                map->writecnt--;
                mutex_unlock(&map->freeze_mutex);
        }
-
-       bpf_map_put_with_uref(map);
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -614,28 +619,51 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 
        mutex_lock(&map->freeze_mutex);
 
-       if ((vma->vm_flags & VM_WRITE) && map->frozen) {
-               err = -EPERM;
-               goto out;
+       if (vma->vm_flags & VM_WRITE) {
+               if (map->frozen) {
+                       err = -EPERM;
+                       goto out;
+               }
+               /* map is meant to be read-only, so do not allow mapping it
+                * as writable: that could leak a writable page reference,
+                * letting user-space modify contents after freezing, while
+                * the verifier assumes they do not change
+                */
+               if (map->map_flags & BPF_F_RDONLY_PROG) {
+                       err = -EACCES;
+                       goto out;
+               }
        }
 
        /* set default open/close callbacks */
        vma->vm_ops = &bpf_map_default_vmops;
        vma->vm_private_data = map;
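+       /* map memory must never be mapped executable */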
+       vma->vm_flags &= ~VM_MAYEXEC;
+       if (!(vma->vm_flags & VM_WRITE))
+               /* disallow re-mapping with PROT_WRITE */
+               vma->vm_flags &= ~VM_MAYWRITE;
 
        err = map->ops->map_mmap(map, vma);
        if (err)
                goto out;
 
-       bpf_map_inc_with_uref(map);
-
-       if (vma->vm_flags & VM_WRITE)
+       if (vma->vm_flags & VM_MAYWRITE)
                map->writecnt++;
 out:
        mutex_unlock(&map->freeze_mutex);
        return err;
 }
 
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+       struct bpf_map *map = filp->private_data;
+
+       if (map->ops->map_poll)
+               return map->ops->map_poll(map, filp, pts);
+
+       return EPOLLERR;
+}
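+
+/* A map type opts into polling by providing ->map_poll in its bpf_map_ops.
+ * Rough sketch of such a callback (hypothetical my_map type with a wait
+ * queue; not part of this patch):
+ *
+ *	static __poll_t my_map_poll(struct bpf_map *map, struct file *filp,
+ *				    struct poll_table_struct *pts)
+ *	{
+ *		struct my_map *m = container_of(map, struct my_map, map);
+ *
+ *		poll_wait(filp, &m->waitq, pts);
+ *		return my_map_has_data(m) ? EPOLLIN | EPOLLRDNORM : 0;
+ *	}
+ */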
+
 const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_map_show_fdinfo,
@@ -644,6 +672,7 @@ const struct file_operations bpf_map_fops = {
        .read           = bpf_dummy_read,
        .write          = bpf_dummy_write,
        .mmap           = bpf_map_mmap,
+       .poll           = bpf_map_poll,
 };
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1361,7 +1390,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
 
        buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
        if (!buf) {
-               kvfree(buf_prevkey);
+               kfree(buf_prevkey);
                return -ENOMEM;
        }
 
@@ -1446,7 +1475,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
-       if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+       if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+           !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto err_put;
        }
@@ -1474,8 +1504,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        if (err)
                goto free_value;
 
-       if (copy_to_user(uvalue, value, value_size) != 0)
+       if (copy_to_user(uvalue, value, value_size) != 0) {
+               err = -EFAULT;
                goto free_value;
+       }
 
        err = 0;
 
@@ -1519,7 +1551,7 @@ static int map_freeze(const union bpf_attr *attr)
                err = -EBUSY;
                goto err_put;
        }
-       if (!capable(CAP_SYS_ADMIN)) {
+       if (!bpf_capable()) {
                err = -EPERM;
                goto err_put;
        }
@@ -1535,9 +1567,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        [_id] = & _name ## _prog_ops,
 #define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 };
 
 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -1959,6 +1993,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
@@ -1992,6 +2030,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
        }
 }
 
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+       switch (prog_type) {
+       case BPF_PROG_TYPE_SCHED_CLS:
+       case BPF_PROG_TYPE_SCHED_ACT:
+       case BPF_PROG_TYPE_XDP:
+       case BPF_PROG_TYPE_LWT_IN:
+       case BPF_PROG_TYPE_LWT_OUT:
+       case BPF_PROG_TYPE_LWT_XMIT:
+       case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+       case BPF_PROG_TYPE_SK_SKB:
+       case BPF_PROG_TYPE_SK_MSG:
+       case BPF_PROG_TYPE_LIRC_MODE2:
+       case BPF_PROG_TYPE_FLOW_DISSECTOR:
+       case BPF_PROG_TYPE_CGROUP_DEVICE:
+       case BPF_PROG_TYPE_CGROUP_SOCK:
+       case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+       case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+       case BPF_PROG_TYPE_CGROUP_SYSCTL:
+       case BPF_PROG_TYPE_SOCK_OPS:
+       case BPF_PROG_TYPE_EXT: /* extends any prog */
+               return true;
+       case BPF_PROG_TYPE_CGROUP_SKB:
+               /* always unpriv */
+       case BPF_PROG_TYPE_SK_REUSEPORT:
+               /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+       default:
+               return false;
+       }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+       switch (prog_type) {
+       case BPF_PROG_TYPE_KPROBE:
+       case BPF_PROG_TYPE_TRACEPOINT:
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+       case BPF_PROG_TYPE_TRACING:
+       case BPF_PROG_TYPE_LSM:
+       case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+       case BPF_PROG_TYPE_EXT: /* extends any prog */
+               return true;
+       default:
+               return false;
+       }
+}
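+
+/* Together the two predicates above implement the capability split for
+ * program loading: networking types additionally require CAP_NET_ADMIN,
+ * tracing types require CAP_PERFMON, and everything else loads with just
+ * bpf_capable(). For reference, the helpers introduced by the CAP_BPF
+ * series are roughly:
+ *
+ *	static inline bool bpf_capable(void)
+ *	{
+ *		return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
+ *	}
+ *
+ *	static inline bool perfmon_capable(void)
+ *	{
+ *		return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
+ *	}
+ */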
+
 /* last field in 'union bpf_attr' used by this command */
 #define        BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
 
@@ -2014,7 +2101,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-           !capable(CAP_SYS_ADMIN))
+           !bpf_capable())
                return -EPERM;
 
        /* copy eBPF program license from user space */
@@ -2027,11 +2114,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
        is_gpl = license_is_gpl_compatible(license);
 
        if (attr->insn_cnt == 0 ||
-           attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+           attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
                return -E2BIG;
        if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
            type != BPF_PROG_TYPE_CGROUP_SKB &&
-           !capable(CAP_SYS_ADMIN))
+           !bpf_capable())
+               return -EPERM;
+
+       if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN))
+               return -EPERM;
+       if (is_perfmon_prog_type(type) && !perfmon_capable())
                return -EPERM;
 
        bpf_prog_load_fixup_attach_type(attr);
@@ -2170,25 +2262,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
                                attr->file_flags);
 }
 
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
-                  struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+                  const struct bpf_link_ops *ops, struct bpf_prog *prog)
 {
        atomic64_set(&link->refcnt, 1);
+       link->type = type;
+       link->id = 0;
        link->ops = ops;
        link->prog = prog;
 }
 
+static void bpf_link_free_id(int id)
+{
+       if (!id)
+               return;
+
+       spin_lock_bh(&link_idr_lock);
+       idr_remove(&link_idr, id);
+       spin_unlock_bh(&link_idr_lock);
+}
+
 /* Clean up bpf_link and corresponding anon_inode file and FD. After
  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marks bpf_link as
+ * defunct, releases the anon_inode file and puts the reserved FD. bpf_prog's
+ * refcnt is not decremented; that is the responsibility of the caller that
+ * failed to complete bpf_link initialization.
  */
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
-                     int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
 {
-       link->prog = NULL;
-       fput(link_file);
-       put_unused_fd(link_fd);
+       primer->link->prog = NULL;
+       bpf_link_free_id(primer->id);
+       fput(primer->file);
+       put_unused_fd(primer->fd);
 }
 
 void bpf_link_inc(struct bpf_link *link)
@@ -2199,6 +2305,7 @@ void bpf_link_inc(struct bpf_link *link)
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+       bpf_link_free_id(link->id);
        if (link->prog) {
                /* detach BPF program, clean up used resources */
                link->ops->release(link);
@@ -2240,39 +2347,39 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
 }
 
 #ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+       [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
 
 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 {
        const struct bpf_link *link = filp->private_data;
        const struct bpf_prog *prog = link->prog;
        char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
-       const char *link_type;
-
-       if (link->ops == &bpf_raw_tp_lops)
-               link_type = "raw_tracepoint";
-       else if (link->ops == &bpf_tracing_link_lops)
-               link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
-       else if (link->ops == &bpf_cgroup_link_lops)
-               link_type = "cgroup";
-#endif
-       else
-               link_type = "unknown";
 
        bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
        seq_printf(m,
                   "link_type:\t%s\n"
+                  "link_id:\t%u\n"
                   "prog_tag:\t%s\n"
                   "prog_id:\t%u\n",
-                  link_type,
+                  bpf_link_type_strs[link->type],
+                  link->id,
                   prog_tag,
                   prog->aux->id);
+       if (link->ops->show_fdinfo)
+               link->ops->show_fdinfo(link, m);
 }
 #endif
 
-const struct file_operations bpf_link_fops = {
+static const struct file_operations bpf_link_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_link_show_fdinfo,
 #endif
@@ -2281,36 +2388,77 @@ const struct file_operations bpf_link_fops = {
        .write          = bpf_dummy_write,
 };
 
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
 {
-       return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+       int id;
 
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+       idr_preload(GFP_KERNEL);
+       spin_lock_bh(&link_idr_lock);
+       id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+       spin_unlock_bh(&link_idr_lock);
+       idr_preload_end();
+
+       return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with a bpf_link_cleanup() call.
+ * All the transient state is passed around in struct bpf_link_primer.
+ * This is the preferred way to create and initialize bpf_link, especially
+ * when there are complicated and expensive operations in between creating
+ * bpf_link itself and attaching it to a BPF hook. By using bpf_link_prime()
+ * and bpf_link_settle(), kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) rollback operations in the rare case
+ * that file, FD, or ID can't be allocated.
  */
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
 {
        struct file *file;
-       int fd;
+       int fd, id;
 
        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
-               return ERR_PTR(fd);
+               return fd;
+
+       id = bpf_link_alloc_id(link);
+       if (id < 0) {
+               put_unused_fd(fd);
+               return id;
+       }
 
        file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
        if (IS_ERR(file)) {
+               bpf_link_free_id(id);
                put_unused_fd(fd);
-               return file;
+               return PTR_ERR(file);
        }
 
-       *reserved_fd = fd;
-       return file;
+       primer->link = link;
+       primer->file = file;
+       primer->fd = fd;
+       primer->id = id;
+       return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+       /* make bpf_link fetchable by ID */
+       spin_lock_bh(&link_idr_lock);
+       primer->link->id = primer->id;
+       spin_unlock_bh(&link_idr_lock);
+       /* make bpf_link fetchable by FD */
+       fd_install(primer->fd, primer->file);
+       /* pass through installed FD */
+       return primer->fd;
+}
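+
+/* Condensed usage pattern for the prime/settle/cleanup trio, mirroring
+ * bpf_tracing_prog_attach() and bpf_raw_tracepoint_open() below
+ * (BPF_LINK_TYPE_FOO, foo_link_lops and attach_to_hook() are placeholders).
+ * On prime failure the link isn't visible anywhere yet and can simply be
+ * kfree'd; on attach failure bpf_link_cleanup() undoes file/FD/ID but
+ * leaves the prog reference for the caller to drop:
+ *
+ *	bpf_link_init(&link->link, BPF_LINK_TYPE_FOO, &foo_link_lops, prog);
+ *	err = bpf_link_prime(&link->link, &primer);
+ *	if (err) {
+ *		kfree(link);
+ *		goto out_put_prog;
+ *	}
+ *	err = attach_to_hook(link);
+ *	if (err) {
+ *		bpf_link_cleanup(&primer);
+ *		goto out_put_prog;
+ *	}
+ *	return bpf_link_settle(&primer);
+ */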
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+       return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
 }
 
 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2334,6 +2482,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 
 struct bpf_tracing_link {
        struct bpf_link link;
+       enum bpf_attach_type attach_type;
 };
 
 static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2349,16 +2498,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
        kfree(tr_link);
 }
 
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+                                        struct seq_file *seq)
+{
+       struct bpf_tracing_link *tr_link =
+               container_of(link, struct bpf_tracing_link, link);
+
+       seq_printf(seq,
+                  "attach_type:\t%d\n",
+                  tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+                                          struct bpf_link_info *info)
+{
+       struct bpf_tracing_link *tr_link =
+               container_of(link, struct bpf_tracing_link, link);
+
+       info->tracing.attach_type = tr_link->attach_type;
+
+       return 0;
+}
+
 static const struct bpf_link_ops bpf_tracing_link_lops = {
        .release = bpf_tracing_link_release,
        .dealloc = bpf_tracing_link_dealloc,
+       .show_fdinfo = bpf_tracing_link_show_fdinfo,
+       .fill_link_info = bpf_tracing_link_fill_link_info,
 };
 
 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 {
+       struct bpf_link_primer link_primer;
        struct bpf_tracing_link *link;
-       struct file *link_file;
-       int link_fd, err;
+       int err;
 
        switch (prog->type) {
        case BPF_PROG_TYPE_TRACING:
@@ -2391,24 +2564,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
                err = -ENOMEM;
                goto out_put_prog;
        }
-       bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+       bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+                     &bpf_tracing_link_lops, prog);
+       link->attach_type = prog->expected_attach_type;
 
-       link_file = bpf_link_new_file(&link->link, &link_fd);
-       if (IS_ERR(link_file)) {
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err) {
                kfree(link);
-               err = PTR_ERR(link_file);
                goto out_put_prog;
        }
 
        err = bpf_trampoline_link_prog(prog);
        if (err) {
-               bpf_link_cleanup(&link->link, link_file, link_fd);
+               bpf_link_cleanup(&link_primer);
                goto out_put_prog;
        }
 
-       fd_install(link_fd, link_file);
-       return link_fd;
-
+       return bpf_link_settle(&link_primer);
 out_put_prog:
        bpf_prog_put(prog);
        return err;
@@ -2436,22 +2608,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
        kfree(raw_tp);
 }
 
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+                                       struct seq_file *seq)
+{
+       struct bpf_raw_tp_link *raw_tp_link =
+               container_of(link, struct bpf_raw_tp_link, link);
+
+       seq_printf(seq,
+                  "tp_name:\t%s\n",
+                  raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+                                         struct bpf_link_info *info)
+{
+       struct bpf_raw_tp_link *raw_tp_link =
+               container_of(link, struct bpf_raw_tp_link, link);
+       char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+       const char *tp_name = raw_tp_link->btp->tp->name;
+       u32 ulen = info->raw_tracepoint.tp_name_len;
+       size_t tp_len = strlen(tp_name);
+
+       /* tp_name buffer and its length must be provided together,
+        * otherwise ulen - 1 below would underflow for a zero-length buffer
+        */
+       if (!ulen ^ !ubuf)
+               return -EINVAL;
+
+       info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+       if (!ubuf)
+               return 0;
+
+       if (ulen >= tp_len + 1) {
+               if (copy_to_user(ubuf, tp_name, tp_len + 1))
+                       return -EFAULT;
+       } else {
+               char zero = '\0';
+
+               if (copy_to_user(ubuf, tp_name, ulen - 1))
+                       return -EFAULT;
+               if (put_user(zero, ubuf + ulen - 1))
+                       return -EFAULT;
+               return -ENOSPC;
+       }
+
+       return 0;
+}
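+
+/* The tp_name round-trip above lets user-space size its buffer in two
+ * steps: query with tp_name_len == 0 to learn the required length, then
+ * retry. Minimal user-space sketch (direct syscall, no libbpf):
+ *
+ *	struct bpf_link_info info = {};
+ *	union bpf_attr attr = {};
+ *	char name[128];
+ *
+ *	info.raw_tracepoint.tp_name = (__u64)(uintptr_t)name;
+ *	info.raw_tracepoint.tp_name_len = sizeof(name);
+ *	attr.info.bpf_fd = link_fd;
+ *	attr.info.info = (__u64)(uintptr_t)&info;
+ *	attr.info.info_len = sizeof(info);
+ *	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+ *
+ * A -ENOSPC return means name holds a truncated, NUL-terminated prefix.
+ */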
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
        .release = bpf_raw_tp_link_release,
        .dealloc = bpf_raw_tp_link_dealloc,
+       .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+       .fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
 
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 {
+       struct bpf_link_primer link_primer;
        struct bpf_raw_tp_link *link;
        struct bpf_raw_event_map *btp;
-       struct file *link_file;
        struct bpf_prog *prog;
        const char *tp_name;
        char buf[128];
-       int link_fd, err;
+       int err;
 
        if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
                return -EINVAL;
@@ -2504,24 +2723,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
                err = -ENOMEM;
                goto out_put_btp;
        }
-       bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+       bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+                     &bpf_raw_tp_link_lops, prog);
        link->btp = btp;
 
-       link_file = bpf_link_new_file(&link->link, &link_fd);
-       if (IS_ERR(link_file)) {
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err) {
                kfree(link);
-               err = PTR_ERR(link_file);
                goto out_put_btp;
        }
 
        err = bpf_probe_register(link->btp, prog);
        if (err) {
-               bpf_link_cleanup(&link->link, link_file, link_fd);
+               bpf_link_cleanup(&link_primer);
                goto out_put_btp;
        }
 
-       fd_install(link_fd, link_file);
-       return link_fd;
+       return bpf_link_settle(&link_primer);
 
 out_put_btp:
        bpf_put_raw_tracepoint(btp);
@@ -2539,6 +2757,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
        case BPF_PROG_TYPE_CGROUP_SKB:
+               if (!capable(CAP_NET_ADMIN))
+                       /* cg-skb progs can be loaded by an unprivileged
+                        * user; check permissions at attach time
+                        */
+                       return -EPERM;
                return prog->enforce_expected_attach_type &&
                        prog->expected_attach_type != attach_type ?
                        -EINVAL : 0;
@@ -2563,6 +2786,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2586,6 +2813,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_GETSOCKOPT:
        case BPF_CGROUP_SETSOCKOPT:
                return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+       case BPF_TRACE_ITER:
+               return BPF_PROG_TYPE_TRACING;
        default:
                return BPF_PROG_TYPE_UNSPEC;
        }
@@ -2602,9 +2831,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
        struct bpf_prog *prog;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_PROG_ATTACH))
                return -EINVAL;
 
@@ -2633,7 +2859,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
                ret = lirc_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
-               ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+               ret = netns_bpf_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
@@ -2659,9 +2885,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 {
        enum bpf_prog_type ptype;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_PROG_DETACH))
                return -EINVAL;
 
@@ -2674,7 +2897,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
        case BPF_PROG_TYPE_LIRC_MODE2:
                return lirc_prog_detach(attr);
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
-               return skb_flow_dissector_bpf_prog_detach(attr);
+               if (!capable(CAP_NET_ADMIN))
+                       return -EPERM;
+               return netns_bpf_prog_detach(attr);
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2710,6 +2935,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_CGROUP_INET6_POST_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2723,7 +2952,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_LIRC_MODE2:
                return lirc_prog_query(attr, uattr);
        case BPF_FLOW_DISSECTOR:
-               return skb_flow_dissector_prog_query(attr, uattr);
+               return netns_bpf_prog_query(attr, uattr);
        default:
                return -EINVAL;
        }
@@ -2737,8 +2966,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
        struct bpf_prog *prog;
        int ret = -ENOTSUPP;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
        if (CHECK_ATTR(BPF_PROG_TEST_RUN))
                return -EINVAL;
 
@@ -2789,6 +3016,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
        return err;
 }
 
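+/* Find the map with the smallest id >= *id and take a reference on it;
+ * maps whose refcount already dropped to zero are skipped. Used by the
+ * BPF map iterator.
+ */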
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+       struct bpf_map *map;
+
+       spin_lock_bh(&map_idr_lock);
+again:
+       map = idr_get_next(&map_idr, id);
+       if (map) {
+               map = __bpf_map_inc_not_zero(map, false);
+               if (IS_ERR(map)) {
+                       (*id)++;
+                       goto again;
+               }
+       }
+       spin_unlock_bh(&map_idr_lock);
+
+       return map;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3020,7 +3266,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
        info.run_time_ns = stats.nsecs;
        info.run_cnt = stats.cnt;
 
-       if (!capable(CAP_SYS_ADMIN)) {
+       if (!bpf_capable()) {
                info.jited_prog_len = 0;
                info.xlated_prog_len = 0;
                info.nr_jited_ksyms = 0;
@@ -3302,6 +3548,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
        return btf_get_info_by_fd(btf, attr, uattr);
 }
 
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+                                 const union bpf_attr *attr,
+                                 union bpf_attr __user *uattr)
+{
+       struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+       struct bpf_link_info info;
+       u32 info_len = attr->info.info_len;
+       int err;
+
+       err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+       if (err)
+               return err;
+       info_len = min_t(u32, sizeof(info), info_len);
+
+       memset(&info, 0, sizeof(info));
+       if (copy_from_user(&info, uinfo, info_len))
+               return -EFAULT;
+
+       info.type = link->type;
+       info.id = link->id;
+       info.prog_id = link->prog->aux->id;
+
+       if (link->ops->fill_link_info) {
+               err = link->ops->fill_link_info(link, &info);
+               if (err)
+                       return err;
+       }
+
+       if (copy_to_user(uinfo, &info, info_len) ||
+           put_user(info_len, &uattr->info.info_len))
+               return -EFAULT;
+
+       return 0;
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3326,6 +3608,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
                                             uattr);
        else if (f.file->f_op == &btf_fops)
                err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+       else if (f.file->f_op == &bpf_link_fops)
+               err = bpf_link_get_info_by_fd(f.file->private_data,
+                                             attr, uattr);
        else
                err = -EINVAL;
 
@@ -3340,7 +3625,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
        if (CHECK_ATTR(BPF_BTF_LOAD))
                return -EINVAL;
 
-       if (!capable(CAP_SYS_ADMIN))
+       if (!bpf_capable())
                return -EPERM;
 
        return btf_new_fd(attr);
@@ -3453,7 +3738,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
        if (file->f_op == &bpf_link_fops) {
                struct bpf_link *link = file->private_data;
 
-               if (link->ops == &bpf_raw_tp_lops) {
+               if (link->ops == &bpf_raw_tp_link_lops) {
                        struct bpf_raw_tp_link *raw_tp =
                                container_of(link, struct bpf_raw_tp_link, link);
                        struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3547,6 +3832,15 @@ err_put:
        return err;
 }
 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+       if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+           prog->expected_attach_type == BPF_TRACE_ITER)
+               return bpf_iter_link_attach(attr, prog);
+
+       return -EINVAL;
+}
+
 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
 static int link_create(union bpf_attr *attr)
 {
@@ -3554,9 +3848,6 @@ static int link_create(union bpf_attr *attr)
        struct bpf_prog *prog;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_LINK_CREATE))
                return -EINVAL;
 
@@ -3583,6 +3874,12 @@ static int link_create(union bpf_attr *attr)
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                ret = cgroup_bpf_link_attach(attr, prog);
                break;
+       case BPF_PROG_TYPE_TRACING:
+               ret = tracing_bpf_link_attach(attr, prog);
+               break;
+       case BPF_PROG_TYPE_FLOW_DISSECTOR:
+               ret = netns_bpf_link_create(attr, prog);
+               break;
        default:
                ret = -EINVAL;
        }
@@ -3602,9 +3899,6 @@ static int link_update(union bpf_attr *attr)
        u32 flags;
        int ret;
 
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
        if (CHECK_ATTR(BPF_LINK_UPDATE))
                return -EINVAL;
 
@@ -3617,8 +3911,10 @@ static int link_update(union bpf_attr *attr)
                return PTR_ERR(link);
 
        new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
-       if (IS_ERR(new_prog))
-               return PTR_ERR(new_prog);
+       if (IS_ERR(new_prog)) {
+               ret = PTR_ERR(new_prog);
+               goto out_put_link;
+       }
 
        if (flags & BPF_F_REPLACE) {
                old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
@@ -3627,30 +3923,151 @@ static int link_update(union bpf_attr *attr)
                        old_prog = NULL;
                        goto out_put_progs;
                }
-       }
-
-#ifdef CONFIG_CGROUP_BPF
-       if (link->ops == &bpf_cgroup_link_lops) {
-               ret = cgroup_bpf_replace(link, old_prog, new_prog);
+       } else if (attr->link_update.old_prog_fd) {
+               ret = -EINVAL;
                goto out_put_progs;
        }
-#endif
-       ret = -EINVAL;
+
+       if (link->ops->update_prog)
+               ret = link->ops->update_prog(link, new_prog, old_prog);
+       else
+               ret = -EINVAL;
 
 out_put_progs:
        if (old_prog)
                bpf_prog_put(old_prog);
        if (ret)
                bpf_prog_put(new_prog);
+out_put_link:
+       bpf_link_put(link);
        return ret;
 }
 
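+/* Take a reference only if refcnt isn't already zero; called under
+ * link_idr_lock, so a link that is concurrently being freed is never
+ * resurrected.
+ */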
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+       return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+       struct bpf_link *link;
+       u32 id = attr->link_id;
+       int fd, err;
+
+       if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       spin_lock_bh(&link_idr_lock);
+       link = idr_find(&link_idr, id);
+       /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+       if (link) {
+               if (link->id)
+                       err = bpf_link_inc_not_zero(link);
+               else
+                       err = -EAGAIN;
+       } else {
+               err = -ENOENT;
+       }
+       spin_unlock_bh(&link_idr_lock);
+
+       if (err)
+               return err;
+
+       fd = bpf_link_new_fd(link);
+       if (fd < 0)
+               bpf_link_put(link);
+
+       return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+       mutex_lock(&bpf_stats_enabled_mutex);
+       static_key_slow_dec(&bpf_stats_enabled_key.key);
+       mutex_unlock(&bpf_stats_enabled_mutex);
+       return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+       .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+       int fd;
+
+       mutex_lock(&bpf_stats_enabled_mutex);
+
+       /* Set a very high limit to avoid overflow */
+       if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+               mutex_unlock(&bpf_stats_enabled_mutex);
+               return -EBUSY;
+       }
+
+       fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+       if (fd >= 0)
+               static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+       mutex_unlock(&bpf_stats_enabled_mutex);
+       return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+       if (CHECK_ATTR(BPF_ENABLE_STATS))
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (attr->enable_stats.type) {
+       case BPF_STATS_RUN_TIME:
+               return bpf_enable_runtime_stats();
+       default:
+               break;
+       }
+       return -EINVAL;
+}
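+
+/* The returned FD acts as a reference on the stats static key: run-time
+ * stats stay enabled until every such FD is closed. User-space sketch:
+ *
+ *	union bpf_attr attr = {};
+ *
+ *	attr.enable_stats.type = BPF_STATS_RUN_TIME;
+ *	fd = syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
+ *
+ * Per-program run_time_ns/run_cnt then accumulate (visible via
+ * BPF_OBJ_GET_INFO_BY_FD) until close(fd).
+ */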
+
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+       struct bpf_link *link;
+       int err;
+
+       if (CHECK_ATTR(BPF_ITER_CREATE))
+               return -EINVAL;
+
+       if (attr->iter_create.flags)
+               return -EINVAL;
+
+       link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+       if (IS_ERR(link))
+               return PTR_ERR(link);
+
+       err = bpf_iter_new_fd(link);
+       bpf_link_put(link);
+
+       return err;
+}
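+
+/* BPF_ITER_CREATE turns an attached iterator link into a readable FD;
+ * user-space then read()s until EOF while the kernel runs the iterator
+ * program per element. Sketch, assuming link_fd holds a BPF_TRACE_ITER
+ * link:
+ *
+ *	attr.iter_create.link_fd = link_fd;
+ *	iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
+ *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
+ *		fwrite(buf, 1, n, stdout);
+ *	close(iter_fd);
+ */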
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
        union bpf_attr attr;
        int err;
 
-       if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+       if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
                return -EPERM;
 
        err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -3762,6 +4179,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
        case BPF_LINK_UPDATE:
                err = link_update(&attr);
                break;
+       case BPF_LINK_GET_FD_BY_ID:
+               err = bpf_link_get_fd_by_id(&attr);
+               break;
+       case BPF_LINK_GET_NEXT_ID:
+               err = bpf_obj_get_next_id(&attr, uattr,
+                                         &link_idr, &link_idr_lock);
+               break;
+       case BPF_ENABLE_STATS:
+               err = bpf_enable_stats(&attr);
+               break;
+       case BPF_ITER_CREATE:
+               err = bpf_iter_create(&attr);
+               break;
        default:
                err = -EINVAL;
                break;