kernel/bpf/syscall.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/bpf_trace.h>
6 #include <linux/bpf_lirc.h>
7 #include <linux/btf.h>
8 #include <linux/syscalls.h>
9 #include <linux/slab.h>
10 #include <linux/sched/signal.h>
11 #include <linux/vmalloc.h>
12 #include <linux/mmzone.h>
13 #include <linux/anon_inodes.h>
14 #include <linux/fdtable.h>
15 #include <linux/file.h>
16 #include <linux/fs.h>
17 #include <linux/license.h>
18 #include <linux/filter.h>
19 #include <linux/version.h>
20 #include <linux/kernel.h>
21 #include <linux/idr.h>
22 #include <linux/cred.h>
23 #include <linux/timekeeping.h>
24 #include <linux/ctype.h>
25 #include <linux/nospec.h>
26 #include <linux/audit.h>
27 #include <uapi/linux/btf.h>
28 #include <linux/bpf_lsm.h>
29
30 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
31                           (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
32                           (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
33 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
34 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
35 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
36                         IS_FD_HASH(map))
37
38 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
39
40 DEFINE_PER_CPU(int, bpf_prog_active);
41 static DEFINE_IDR(prog_idr);
42 static DEFINE_SPINLOCK(prog_idr_lock);
43 static DEFINE_IDR(map_idr);
44 static DEFINE_SPINLOCK(map_idr_lock);
45
46 int sysctl_unprivileged_bpf_disabled __read_mostly;
47
48 static const struct bpf_map_ops * const bpf_map_types[] = {
49 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
50 #define BPF_MAP_TYPE(_id, _ops) \
51         [_id] = &_ops,
52 #include <linux/bpf_types.h>
53 #undef BPF_PROG_TYPE
54 #undef BPF_MAP_TYPE
55 };
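/* For example, the BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) entry in
 * bpf_types.h expands to "[BPF_MAP_TYPE_ARRAY] = &array_map_ops," here, while
 * the BPF_PROG_TYPE() entries expand to nothing, so this table only maps map
 * types to their ops.
 */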
56
57 /*
58  * If we're handed a bigger struct than we know of, ensure all the unknown bits
59  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
60  * we don't know about yet.
61  *
62  * There is a ToCToU window between this function call and the following
63  * copy_from_user() call. However, this is not a concern since this function
64  * is only meant to future-proof against unknown trailing bits.
65  */
66 int bpf_check_uarg_tail_zero(void __user *uaddr,
67                              size_t expected_size,
68                              size_t actual_size)
69 {
70         unsigned char __user *addr;
71         unsigned char __user *end;
72         unsigned char val;
73         int err;
74
75         if (unlikely(actual_size > PAGE_SIZE))  /* silly large */
76                 return -E2BIG;
77
78         if (unlikely(!access_ok(uaddr, actual_size)))
79                 return -EFAULT;
80
81         if (actual_size <= expected_size)
82                 return 0;
83
84         addr = uaddr + expected_size;
85         end  = uaddr + actual_size;
86
87         for (; addr < end; addr++) {
88                 err = get_user(val, addr);
89                 if (err)
90                         return err;
91                 if (val)
92                         return -E2BIG;
93         }
94
95         return 0;
96 }
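/* A minimal usage sketch, mirroring how the syscall handlers below consume a
 * possibly larger "attr" from newer userspace (names are illustrative):
 *
 *	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
 *	if (err)
 *		return err;
 *	size = min_t(u32, size, sizeof(attr));
 *	if (copy_from_user(&attr, uattr, size))
 *		return -EFAULT;
 *
 * Any non-zero byte past the fields this kernel knows about makes the call
 * fail with -E2BIG instead of being silently ignored.
 */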
97
98 const struct bpf_map_ops bpf_map_offload_ops = {
99         .map_alloc = bpf_map_offload_map_alloc,
100         .map_free = bpf_map_offload_map_free,
101         .map_check_btf = map_check_no_btf,
102 };
103
104 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
105 {
106         const struct bpf_map_ops *ops;
107         u32 type = attr->map_type;
108         struct bpf_map *map;
109         int err;
110
111         if (type >= ARRAY_SIZE(bpf_map_types))
112                 return ERR_PTR(-EINVAL);
113         type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
114         ops = bpf_map_types[type];
115         if (!ops)
116                 return ERR_PTR(-EINVAL);
117
118         if (ops->map_alloc_check) {
119                 err = ops->map_alloc_check(attr);
120                 if (err)
121                         return ERR_PTR(err);
122         }
123         if (attr->map_ifindex)
124                 ops = &bpf_map_offload_ops;
125         map = ops->map_alloc(attr);
126         if (IS_ERR(map))
127                 return map;
128         map->ops = ops;
129         map->map_type = type;
130         return map;
131 }
132
133 static u32 bpf_map_value_size(struct bpf_map *map)
134 {
135         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
136             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
137             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
138             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
139                 return round_up(map->value_size, 8) * num_possible_cpus();
140         else if (IS_FD_MAP(map))
141                 return sizeof(u32);
142         else
143                 return  map->value_size;
144 }
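/* Example (illustrative numbers): for a BPF_MAP_TYPE_PERCPU_ARRAY with
 * value_size == 12 on a system with 4 possible CPUs, userspace must supply
 * round_up(12, 8) * 4 == 64 bytes per element, whereas fd-based maps always
 * exchange a 4-byte id/fd.
 */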
145
146 static void maybe_wait_bpf_programs(struct bpf_map *map)
147 {
148         /* Wait for any running BPF programs to complete so that
149          * userspace, when we return to it, knows that all programs
150          * that could be running use the new map value.
151          */
152         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
153             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
154                 synchronize_rcu();
155 }
156
157 static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
158                                 void *value, __u64 flags)
159 {
160         int err;
161
162         /* Need to create a kthread, thus must support schedule */
163         if (bpf_map_is_dev_bound(map)) {
164                 return bpf_map_offload_update_elem(map, key, value, flags);
165         } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
166                    map->map_type == BPF_MAP_TYPE_SOCKHASH ||
167                    map->map_type == BPF_MAP_TYPE_SOCKMAP ||
168                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
169                 return map->ops->map_update_elem(map, key, value, flags);
170         } else if (IS_FD_PROG_ARRAY(map)) {
171                 return bpf_fd_array_map_update_elem(map, f.file, key, value,
172                                                     flags);
173         }
174
175         bpf_disable_instrumentation();
176         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
177             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
178                 err = bpf_percpu_hash_update(map, key, value, flags);
179         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
180                 err = bpf_percpu_array_update(map, key, value, flags);
181         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
182                 err = bpf_percpu_cgroup_storage_update(map, key, value,
183                                                        flags);
184         } else if (IS_FD_ARRAY(map)) {
185                 rcu_read_lock();
186                 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
187                                                    flags);
188                 rcu_read_unlock();
189         } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
190                 rcu_read_lock();
191                 err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
192                                                   flags);
193                 rcu_read_unlock();
194         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
195                 /* rcu_read_lock() is not needed */
196                 err = bpf_fd_reuseport_array_update_elem(map, key, value,
197                                                          flags);
198         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
199                    map->map_type == BPF_MAP_TYPE_STACK) {
200                 err = map->ops->map_push_elem(map, value, flags);
201         } else {
202                 rcu_read_lock();
203                 err = map->ops->map_update_elem(map, key, value, flags);
204                 rcu_read_unlock();
205         }
206         bpf_enable_instrumentation();
207         maybe_wait_bpf_programs(map);
208
209         return err;
210 }
211
212 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
213                               __u64 flags)
214 {
215         void *ptr;
216         int err;
217
218         if (bpf_map_is_dev_bound(map))
219                 return bpf_map_offload_lookup_elem(map, key, value);
220
221         bpf_disable_instrumentation();
222         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
223             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
224                 err = bpf_percpu_hash_copy(map, key, value);
225         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
226                 err = bpf_percpu_array_copy(map, key, value);
227         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
228                 err = bpf_percpu_cgroup_storage_copy(map, key, value);
229         } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
230                 err = bpf_stackmap_copy(map, key, value);
231         } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
232                 err = bpf_fd_array_map_lookup_elem(map, key, value);
233         } else if (IS_FD_HASH(map)) {
234                 err = bpf_fd_htab_map_lookup_elem(map, key, value);
235         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
236                 err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
237         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
238                    map->map_type == BPF_MAP_TYPE_STACK) {
239                 err = map->ops->map_peek_elem(map, value);
240         } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
241                 /* struct_ops map requires directly updating "value" */
242                 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
243         } else {
244                 rcu_read_lock();
245                 if (map->ops->map_lookup_elem_sys_only)
246                         ptr = map->ops->map_lookup_elem_sys_only(map, key);
247                 else
248                         ptr = map->ops->map_lookup_elem(map, key);
249                 if (IS_ERR(ptr)) {
250                         err = PTR_ERR(ptr);
251                 } else if (!ptr) {
252                         err = -ENOENT;
253                 } else {
254                         err = 0;
255                         if (flags & BPF_F_LOCK)
256                                 /* lock 'ptr' and copy everything but lock */
257                                 copy_map_value_locked(map, value, ptr, true);
258                         else
259                                 copy_map_value(map, value, ptr);
260                         /* mask the lock, since the value wasn't zero-initialized */
261                         check_and_init_map_lock(map, value);
262                 }
263                 rcu_read_unlock();
264         }
265
266         bpf_enable_instrumentation();
267         maybe_wait_bpf_programs(map);
268
269         return err;
270 }
271
272 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
273 {
274         /* We really just want to fail instead of triggering the OOM killer
275          * under memory pressure, therefore we pass __GFP_NORETRY to kmalloc(),
276          * which is used for lower-order allocation requests.
277          *
278          * It has been observed that larger allocation requests done by
279          * vmalloc with __GFP_NORETRY set might fail because no attempt is
280          * made to reclaim memory from the page cache, thus we use
281          * __GFP_RETRY_MAYFAIL there to avoid such situations.
282          */
283
284         const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
285         void *area;
286
287         if (size >= SIZE_MAX)
288                 return NULL;
289
290         /* kmalloc()'ed memory can't be mmap()'ed */
291         if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
292                 area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
293                                     numa_node);
294                 if (area != NULL)
295                         return area;
296         }
297         if (mmapable) {
298                 BUG_ON(!PAGE_ALIGNED(size));
299                 return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
300                                                __GFP_RETRY_MAYFAIL | flags);
301         }
302         return __vmalloc_node_flags_caller(size, numa_node,
303                                            GFP_KERNEL | __GFP_RETRY_MAYFAIL |
304                                            flags, __builtin_return_address(0));
305 }
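/* Roughly: with 4 KiB pages and PAGE_ALLOC_COSTLY_ORDER == 3, requests up to
 * 32 KiB that are not meant to be mmap()ed first try kmalloc_node() with
 * __GFP_NORETRY, while larger or mmapable areas go straight to the
 * vmalloc-based paths with __GFP_RETRY_MAYFAIL.
 */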
306
307 void *bpf_map_area_alloc(u64 size, int numa_node)
308 {
309         return __bpf_map_area_alloc(size, numa_node, false);
310 }
311
312 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
313 {
314         return __bpf_map_area_alloc(size, numa_node, true);
315 }
316
317 void bpf_map_area_free(void *area)
318 {
319         kvfree(area);
320 }
321
322 static u32 bpf_map_flags_retain_permanent(u32 flags)
323 {
324         /* Some map creation flags are not tied to the map object but
325          * rather to the map fd, so they have no meaning upon
326          * map object inspection since multiple file descriptors with
327          * different (access) properties can exist here. Thus, given
328          * this has zero meaning for the map itself, let's clear these
329          * from here.
330          */
331         return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
332 }
333
334 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
335 {
336         map->map_type = attr->map_type;
337         map->key_size = attr->key_size;
338         map->value_size = attr->value_size;
339         map->max_entries = attr->max_entries;
340         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
341         map->numa_node = bpf_map_attr_numa_node(attr);
342 }
343
344 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
345 {
346         unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
347
348         if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
349                 atomic_long_sub(pages, &user->locked_vm);
350                 return -EPERM;
351         }
352         return 0;
353 }
354
355 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
356 {
357         if (user)
358                 atomic_long_sub(pages, &user->locked_vm);
359 }
360
361 int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
362 {
363         u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
364         struct user_struct *user;
365         int ret;
366
367         if (size >= U32_MAX - PAGE_SIZE)
368                 return -E2BIG;
369
370         user = get_current_user();
371         ret = bpf_charge_memlock(user, pages);
372         if (ret) {
373                 free_uid(user);
374                 return ret;
375         }
376
377         mem->pages = pages;
378         mem->user = user;
379
380         return 0;
381 }
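/* Example (illustrative size): charging a map that needs 1 MiB accounts
 * round_up(1 MiB, PAGE_SIZE) >> PAGE_SHIFT == 256 pages (with 4 KiB pages)
 * against the current user's RLIMIT_MEMLOCK; if that would exceed the limit,
 * bpf_charge_memlock() backs the addition out and returns -EPERM.
 */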
382
383 void bpf_map_charge_finish(struct bpf_map_memory *mem)
384 {
385         bpf_uncharge_memlock(mem->user, mem->pages);
386         free_uid(mem->user);
387 }
388
389 void bpf_map_charge_move(struct bpf_map_memory *dst,
390                          struct bpf_map_memory *src)
391 {
392         *dst = *src;
393
394         /* Make sure src will not be used for the redundant uncharging. */
395         memset(src, 0, sizeof(struct bpf_map_memory));
396 }
397
398 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
399 {
400         int ret;
401
402         ret = bpf_charge_memlock(map->memory.user, pages);
403         if (ret)
404                 return ret;
405         map->memory.pages += pages;
406         return ret;
407 }
408
409 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
410 {
411         bpf_uncharge_memlock(map->memory.user, pages);
412         map->memory.pages -= pages;
413 }
414
415 static int bpf_map_alloc_id(struct bpf_map *map)
416 {
417         int id;
418
419         idr_preload(GFP_KERNEL);
420         spin_lock_bh(&map_idr_lock);
421         id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
422         if (id > 0)
423                 map->id = id;
424         spin_unlock_bh(&map_idr_lock);
425         idr_preload_end();
426
427         if (WARN_ON_ONCE(!id))
428                 return -ENOSPC;
429
430         return id > 0 ? 0 : id;
431 }
432
433 void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
434 {
435         unsigned long flags;
436
437         /* Offloaded maps are removed from the IDR store when their device
438          * disappears - even if someone holds an fd to them they are unusable,
439          * the memory is gone, all ops will fail; they are simply waiting for
440          * refcnt to drop to be freed.
441          */
442         if (!map->id)
443                 return;
444
445         if (do_idr_lock)
446                 spin_lock_irqsave(&map_idr_lock, flags);
447         else
448                 __acquire(&map_idr_lock);
449
450         idr_remove(&map_idr, map->id);
451         map->id = 0;
452
453         if (do_idr_lock)
454                 spin_unlock_irqrestore(&map_idr_lock, flags);
455         else
456                 __release(&map_idr_lock);
457 }
458
459 /* called from workqueue */
460 static void bpf_map_free_deferred(struct work_struct *work)
461 {
462         struct bpf_map *map = container_of(work, struct bpf_map, work);
463         struct bpf_map_memory mem;
464
465         bpf_map_charge_move(&mem, &map->memory);
466         security_bpf_map_free(map);
467         /* implementation dependent freeing */
468         map->ops->map_free(map);
469         bpf_map_charge_finish(&mem);
470 }
471
472 static void bpf_map_put_uref(struct bpf_map *map)
473 {
474         if (atomic64_dec_and_test(&map->usercnt)) {
475                 if (map->ops->map_release_uref)
476                         map->ops->map_release_uref(map);
477         }
478 }
479
480 /* decrement map refcnt and schedule it for freeing via workqueue
481  * (underlying map implementation ops->map_free() might sleep)
482  */
483 static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
484 {
485         if (atomic64_dec_and_test(&map->refcnt)) {
486                 /* bpf_map_free_id() must be called first */
487                 bpf_map_free_id(map, do_idr_lock);
488                 btf_put(map->btf);
489                 INIT_WORK(&map->work, bpf_map_free_deferred);
490                 schedule_work(&map->work);
491         }
492 }
493
494 void bpf_map_put(struct bpf_map *map)
495 {
496         __bpf_map_put(map, true);
497 }
498 EXPORT_SYMBOL_GPL(bpf_map_put);
499
500 void bpf_map_put_with_uref(struct bpf_map *map)
501 {
502         bpf_map_put_uref(map);
503         bpf_map_put(map);
504 }
505
506 static int bpf_map_release(struct inode *inode, struct file *filp)
507 {
508         struct bpf_map *map = filp->private_data;
509
510         if (map->ops->map_release)
511                 map->ops->map_release(map, filp);
512
513         bpf_map_put_with_uref(map);
514         return 0;
515 }
516
517 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
518 {
519         fmode_t mode = f.file->f_mode;
520
521         /* Our file permissions may have been overridden by the map's
522          * global permissions on the syscall side.
523          */
524         if (READ_ONCE(map->frozen))
525                 mode &= ~FMODE_CAN_WRITE;
526         return mode;
527 }
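/* Example: once a map has been frozen via BPF_MAP_FREEZE, even an fd that was
 * opened read-write loses FMODE_CAN_WRITE here, so syscall-side writes such
 * as BPF_MAP_UPDATE_ELEM fail with -EPERM while lookups keep working.
 */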
528
529 #ifdef CONFIG_PROC_FS
530 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
531 {
532         const struct bpf_map *map = filp->private_data;
533         const struct bpf_array *array;
534         u32 type = 0, jited = 0;
535
536         if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
537                 array = container_of(map, struct bpf_array, map);
538                 type  = array->aux->type;
539                 jited = array->aux->jited;
540         }
541
542         seq_printf(m,
543                    "map_type:\t%u\n"
544                    "key_size:\t%u\n"
545                    "value_size:\t%u\n"
546                    "max_entries:\t%u\n"
547                    "map_flags:\t%#x\n"
548                    "memlock:\t%llu\n"
549                    "map_id:\t%u\n"
550                    "frozen:\t%u\n",
551                    map->map_type,
552                    map->key_size,
553                    map->value_size,
554                    map->max_entries,
555                    map->map_flags,
556                    map->memory.pages * 1ULL << PAGE_SHIFT,
557                    map->id,
558                    READ_ONCE(map->frozen));
559         if (type) {
560                 seq_printf(m, "owner_prog_type:\t%u\n", type);
561                 seq_printf(m, "owner_jited:\t%u\n", jited);
562         }
563 }
564 #endif
565
566 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
567                               loff_t *ppos)
568 {
569         /* We need this handler such that alloc_file() enables
570          * f_mode with FMODE_CAN_READ.
571          */
572         return -EINVAL;
573 }
574
575 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
576                                size_t siz, loff_t *ppos)
577 {
578         /* We need this handler such that alloc_file() enables
579          * f_mode with FMODE_CAN_WRITE.
580          */
581         return -EINVAL;
582 }
583
584 /* called for any extra memory-mapped regions (except the initial one) */
585 static void bpf_map_mmap_open(struct vm_area_struct *vma)
586 {
587         struct bpf_map *map = vma->vm_file->private_data;
588
589         if (vma->vm_flags & VM_MAYWRITE) {
590                 mutex_lock(&map->freeze_mutex);
591                 map->writecnt++;
592                 mutex_unlock(&map->freeze_mutex);
593         }
594 }
595
596 /* called for all unmapped memory regions (including the initial one) */
597 static void bpf_map_mmap_close(struct vm_area_struct *vma)
598 {
599         struct bpf_map *map = vma->vm_file->private_data;
600
601         if (vma->vm_flags & VM_MAYWRITE) {
602                 mutex_lock(&map->freeze_mutex);
603                 map->writecnt--;
604                 mutex_unlock(&map->freeze_mutex);
605         }
606 }
607
608 static const struct vm_operations_struct bpf_map_default_vmops = {
609         .open           = bpf_map_mmap_open,
610         .close          = bpf_map_mmap_close,
611 };
612
613 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
614 {
615         struct bpf_map *map = filp->private_data;
616         int err;
617
618         if (!map->ops->map_mmap || map_value_has_spin_lock(map))
619                 return -ENOTSUPP;
620
621         if (!(vma->vm_flags & VM_SHARED))
622                 return -EINVAL;
623
624         mutex_lock(&map->freeze_mutex);
625
626         if (vma->vm_flags & VM_WRITE) {
627                 if (map->frozen) {
628                         err = -EPERM;
629                         goto out;
630                 }
631                 /* map is meant to be read-only, so do not allow mapping it
632                  * as writable, because it's possible to leak a writable page
633                  * reference and allow user-space to still modify it after
634                  * freezing, while the verifier assumes contents do not change
635                  */
636                 if (map->map_flags & BPF_F_RDONLY_PROG) {
637                         err = -EACCES;
638                         goto out;
639                 }
640         }
641
642         /* set default open/close callbacks */
643         vma->vm_ops = &bpf_map_default_vmops;
644         vma->vm_private_data = map;
645         vma->vm_flags &= ~VM_MAYEXEC;
646         if (!(vma->vm_flags & VM_WRITE))
647                 /* disallow re-mapping with PROT_WRITE */
648                 vma->vm_flags &= ~VM_MAYWRITE;
649
650         err = map->ops->map_mmap(map, vma);
651         if (err)
652                 goto out;
653
654         if (vma->vm_flags & VM_MAYWRITE)
655                 map->writecnt++;
656 out:
657         mutex_unlock(&map->freeze_mutex);
658         return err;
659 }
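/* In short: mapping the fd read-only clears VM_MAYWRITE, so the mapping can
 * never be mprotect()ed to writable later, while a writable mapping bumps
 * map->writecnt, which in turn makes a subsequent BPF_MAP_FREEZE fail with
 * -EBUSY until the mapping goes away.
 */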
660
661 const struct file_operations bpf_map_fops = {
662 #ifdef CONFIG_PROC_FS
663         .show_fdinfo    = bpf_map_show_fdinfo,
664 #endif
665         .release        = bpf_map_release,
666         .read           = bpf_dummy_read,
667         .write          = bpf_dummy_write,
668         .mmap           = bpf_map_mmap,
669 };
670
671 int bpf_map_new_fd(struct bpf_map *map, int flags)
672 {
673         int ret;
674
675         ret = security_bpf_map(map, OPEN_FMODE(flags));
676         if (ret < 0)
677                 return ret;
678
679         return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
680                                 flags | O_CLOEXEC);
681 }
682
683 int bpf_get_file_flag(int flags)
684 {
685         if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
686                 return -EINVAL;
687         if (flags & BPF_F_RDONLY)
688                 return O_RDONLY;
689         if (flags & BPF_F_WRONLY)
690                 return O_WRONLY;
691         return O_RDWR;
692 }
693
694 /* helper macro to check that unused fields of 'union bpf_attr' are zero */
695 #define CHECK_ATTR(CMD) \
696         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
697                    sizeof(attr->CMD##_LAST_FIELD), 0, \
698                    sizeof(*attr) - \
699                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
700                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
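/* For instance, CHECK_ATTR(BPF_MAP_CREATE) with
 * BPF_MAP_CREATE_LAST_FIELD == btf_vmlinux_value_type_id scans every byte of
 * "attr" that lies after that field and evaluates to true (i.e. reject) if
 * any of them is non-zero, so each command only accepts the fields it knows.
 */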
701
702 /* dst and src must have at least "size" bytes.
703  * Return the string length on success and < 0 on error.
704  */
705 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
706 {
707         const char *end = src + size;
708         const char *orig_src = src;
709
710         memset(dst, 0, size);
711         /* Copy all isalnum(), '_' and '.' chars. */
712         while (src < end && *src) {
713                 if (!isalnum(*src) &&
714                     *src != '_' && *src != '.')
715                         return -EINVAL;
716                 *dst++ = *src++;
717         }
718
719         /* No '\0' found within "size" bytes */
720         if (src == end)
721                 return -EINVAL;
722
723         return src - orig_src;
724 }
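/* For example (illustrative values), copying "my_prog.v2" into a 16-byte
 * buffer returns 10 and leaves the remaining bytes zeroed, "bad-name" fails
 * with -EINVAL because of the '-', and a 16-character name fails as well
 * since no terminating '\0' fits within the buffer.
 */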
725
726 int map_check_no_btf(const struct bpf_map *map,
727                      const struct btf *btf,
728                      const struct btf_type *key_type,
729                      const struct btf_type *value_type)
730 {
731         return -ENOTSUPP;
732 }
733
734 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
735                          u32 btf_key_id, u32 btf_value_id)
736 {
737         const struct btf_type *key_type, *value_type;
738         u32 key_size, value_size;
739         int ret = 0;
740
741         /* Some maps allow key to be unspecified. */
742         if (btf_key_id) {
743                 key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
744                 if (!key_type || key_size != map->key_size)
745                         return -EINVAL;
746         } else {
747                 key_type = btf_type_by_id(btf, 0);
748                 if (!map->ops->map_check_btf)
749                         return -EINVAL;
750         }
751
752         value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
753         if (!value_type || value_size != map->value_size)
754                 return -EINVAL;
755
756         map->spin_lock_off = btf_find_spin_lock(btf, value_type);
757
758         if (map_value_has_spin_lock(map)) {
759                 if (map->map_flags & BPF_F_RDONLY_PROG)
760                         return -EACCES;
761                 if (map->map_type != BPF_MAP_TYPE_HASH &&
762                     map->map_type != BPF_MAP_TYPE_ARRAY &&
763                     map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
764                     map->map_type != BPF_MAP_TYPE_SK_STORAGE)
765                         return -ENOTSUPP;
766                 if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
767                     map->value_size) {
768                         WARN_ONCE(1,
769                                   "verifier bug spin_lock_off %d value_size %d\n",
770                                   map->spin_lock_off, map->value_size);
771                         return -EFAULT;
772                 }
773         }
774
775         if (map->ops->map_check_btf)
776                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
777
778         return ret;
779 }
780
781 #define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
782 /* called via syscall */
783 static int map_create(union bpf_attr *attr)
784 {
785         int numa_node = bpf_map_attr_numa_node(attr);
786         struct bpf_map_memory mem;
787         struct bpf_map *map;
788         int f_flags;
789         int err;
790
791         err = CHECK_ATTR(BPF_MAP_CREATE);
792         if (err)
793                 return -EINVAL;
794
795         if (attr->btf_vmlinux_value_type_id) {
796                 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
797                     attr->btf_key_type_id || attr->btf_value_type_id)
798                         return -EINVAL;
799         } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
800                 return -EINVAL;
801         }
802
803         f_flags = bpf_get_file_flag(attr->map_flags);
804         if (f_flags < 0)
805                 return f_flags;
806
807         if (numa_node != NUMA_NO_NODE &&
808             ((unsigned int)numa_node >= nr_node_ids ||
809              !node_online(numa_node)))
810                 return -EINVAL;
811
812         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
813         map = find_and_alloc_map(attr);
814         if (IS_ERR(map))
815                 return PTR_ERR(map);
816
817         err = bpf_obj_name_cpy(map->name, attr->map_name,
818                                sizeof(attr->map_name));
819         if (err < 0)
820                 goto free_map;
821
822         atomic64_set(&map->refcnt, 1);
823         atomic64_set(&map->usercnt, 1);
824         mutex_init(&map->freeze_mutex);
825
826         map->spin_lock_off = -EINVAL;
827         if (attr->btf_key_type_id || attr->btf_value_type_id ||
828             /* Even if the map's value is a kernel struct,
829              * the bpf_prog.o must have BTF to begin with
830              * in order to figure out the corresponding
831              * kernel counterpart.  Thus, attr->btf_fd has
832              * to be valid as well.
833              */
834             attr->btf_vmlinux_value_type_id) {
835                 struct btf *btf;
836
837                 btf = btf_get_by_fd(attr->btf_fd);
838                 if (IS_ERR(btf)) {
839                         err = PTR_ERR(btf);
840                         goto free_map;
841                 }
842                 map->btf = btf;
843
844                 if (attr->btf_value_type_id) {
845                         err = map_check_btf(map, btf, attr->btf_key_type_id,
846                                             attr->btf_value_type_id);
847                         if (err)
848                                 goto free_map;
849                 }
850
851                 map->btf_key_type_id = attr->btf_key_type_id;
852                 map->btf_value_type_id = attr->btf_value_type_id;
853                 map->btf_vmlinux_value_type_id =
854                         attr->btf_vmlinux_value_type_id;
855         }
856
857         err = security_bpf_map_alloc(map);
858         if (err)
859                 goto free_map;
860
861         err = bpf_map_alloc_id(map);
862         if (err)
863                 goto free_map_sec;
864
865         err = bpf_map_new_fd(map, f_flags);
866         if (err < 0) {
867                 /* failed to allocate fd.
868                  * bpf_map_put_with_uref() is needed because the above
869                  * bpf_map_alloc_id() has published the map
870                  * to userspace and userspace may
871                  * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
872                  */
873                 bpf_map_put_with_uref(map);
874                 return err;
875         }
876
877         return err;
878
879 free_map_sec:
880         security_bpf_map_free(map);
881 free_map:
882         btf_put(map->btf);
883         bpf_map_charge_move(&mem, &map->memory);
884         map->ops->map_free(map);
885         bpf_map_charge_finish(&mem);
886         return err;
887 }
888
889 /* if error is returned, fd is released.
890  * On success caller should complete fd access with matching fdput()
891  */
892 struct bpf_map *__bpf_map_get(struct fd f)
893 {
894         if (!f.file)
895                 return ERR_PTR(-EBADF);
896         if (f.file->f_op != &bpf_map_fops) {
897                 fdput(f);
898                 return ERR_PTR(-EINVAL);
899         }
900
901         return f.file->private_data;
902 }
903
904 void bpf_map_inc(struct bpf_map *map)
905 {
906         atomic64_inc(&map->refcnt);
907 }
908 EXPORT_SYMBOL_GPL(bpf_map_inc);
909
910 void bpf_map_inc_with_uref(struct bpf_map *map)
911 {
912         atomic64_inc(&map->refcnt);
913         atomic64_inc(&map->usercnt);
914 }
915 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
916
917 struct bpf_map *bpf_map_get(u32 ufd)
918 {
919         struct fd f = fdget(ufd);
920         struct bpf_map *map;
921
922         map = __bpf_map_get(f);
923         if (IS_ERR(map))
924                 return map;
925
926         bpf_map_inc(map);
927         fdput(f);
928
929         return map;
930 }
931
932 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
933 {
934         struct fd f = fdget(ufd);
935         struct bpf_map *map;
936
937         map = __bpf_map_get(f);
938         if (IS_ERR(map))
939                 return map;
940
941         bpf_map_inc_with_uref(map);
942         fdput(f);
943
944         return map;
945 }
946
947 /* map_idr_lock must be held by the caller */
948 static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
949 {
950         int refold;
951
952         refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
953         if (!refold)
954                 return ERR_PTR(-ENOENT);
955         if (uref)
956                 atomic64_inc(&map->usercnt);
957
958         return map;
959 }
960
961 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
962 {
963         spin_lock_bh(&map_idr_lock);
964         map = __bpf_map_inc_not_zero(map, false);
965         spin_unlock_bh(&map_idr_lock);
966
967         return map;
968 }
969 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
970
971 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
972 {
973         return -ENOTSUPP;
974 }
975
976 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
977 {
978         if (key_size)
979                 return memdup_user(ukey, key_size);
980
981         if (ukey)
982                 return ERR_PTR(-EINVAL);
983
984         return NULL;
985 }
986
987 /* last field in 'union bpf_attr' used by this command */
988 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
989
990 static int map_lookup_elem(union bpf_attr *attr)
991 {
992         void __user *ukey = u64_to_user_ptr(attr->key);
993         void __user *uvalue = u64_to_user_ptr(attr->value);
994         int ufd = attr->map_fd;
995         struct bpf_map *map;
996         void *key, *value;
997         u32 value_size;
998         struct fd f;
999         int err;
1000
1001         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1002                 return -EINVAL;
1003
1004         if (attr->flags & ~BPF_F_LOCK)
1005                 return -EINVAL;
1006
1007         f = fdget(ufd);
1008         map = __bpf_map_get(f);
1009         if (IS_ERR(map))
1010                 return PTR_ERR(map);
1011         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1012                 err = -EPERM;
1013                 goto err_put;
1014         }
1015
1016         if ((attr->flags & BPF_F_LOCK) &&
1017             !map_value_has_spin_lock(map)) {
1018                 err = -EINVAL;
1019                 goto err_put;
1020         }
1021
1022         key = __bpf_copy_key(ukey, map->key_size);
1023         if (IS_ERR(key)) {
1024                 err = PTR_ERR(key);
1025                 goto err_put;
1026         }
1027
1028         value_size = bpf_map_value_size(map);
1029
1030         err = -ENOMEM;
1031         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1032         if (!value)
1033                 goto free_key;
1034
1035         err = bpf_map_copy_value(map, key, value, attr->flags);
1036         if (err)
1037                 goto free_value;
1038
1039         err = -EFAULT;
1040         if (copy_to_user(uvalue, value, value_size) != 0)
1041                 goto free_value;
1042
1043         err = 0;
1044
1045 free_value:
1046         kfree(value);
1047 free_key:
1048         kfree(key);
1049 err_put:
1050         fdput(f);
1051         return err;
1052 }
1053
1054
1055 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1056
1057 static int map_update_elem(union bpf_attr *attr)
1058 {
1059         void __user *ukey = u64_to_user_ptr(attr->key);
1060         void __user *uvalue = u64_to_user_ptr(attr->value);
1061         int ufd = attr->map_fd;
1062         struct bpf_map *map;
1063         void *key, *value;
1064         u32 value_size;
1065         struct fd f;
1066         int err;
1067
1068         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1069                 return -EINVAL;
1070
1071         f = fdget(ufd);
1072         map = __bpf_map_get(f);
1073         if (IS_ERR(map))
1074                 return PTR_ERR(map);
1075         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1076                 err = -EPERM;
1077                 goto err_put;
1078         }
1079
1080         if ((attr->flags & BPF_F_LOCK) &&
1081             !map_value_has_spin_lock(map)) {
1082                 err = -EINVAL;
1083                 goto err_put;
1084         }
1085
1086         key = __bpf_copy_key(ukey, map->key_size);
1087         if (IS_ERR(key)) {
1088                 err = PTR_ERR(key);
1089                 goto err_put;
1090         }
1091
1092         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
1093             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
1094             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
1095             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
1096                 value_size = round_up(map->value_size, 8) * num_possible_cpus();
1097         else
1098                 value_size = map->value_size;
1099
1100         err = -ENOMEM;
1101         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1102         if (!value)
1103                 goto free_key;
1104
1105         err = -EFAULT;
1106         if (copy_from_user(value, uvalue, value_size) != 0)
1107                 goto free_value;
1108
1109         err = bpf_map_update_value(map, f, key, value, attr->flags);
1110
1111 free_value:
1112         kfree(value);
1113 free_key:
1114         kfree(key);
1115 err_put:
1116         fdput(f);
1117         return err;
1118 }
1119
1120 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1121
1122 static int map_delete_elem(union bpf_attr *attr)
1123 {
1124         void __user *ukey = u64_to_user_ptr(attr->key);
1125         int ufd = attr->map_fd;
1126         struct bpf_map *map;
1127         struct fd f;
1128         void *key;
1129         int err;
1130
1131         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1132                 return -EINVAL;
1133
1134         f = fdget(ufd);
1135         map = __bpf_map_get(f);
1136         if (IS_ERR(map))
1137                 return PTR_ERR(map);
1138         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1139                 err = -EPERM;
1140                 goto err_put;
1141         }
1142
1143         key = __bpf_copy_key(ukey, map->key_size);
1144         if (IS_ERR(key)) {
1145                 err = PTR_ERR(key);
1146                 goto err_put;
1147         }
1148
1149         if (bpf_map_is_dev_bound(map)) {
1150                 err = bpf_map_offload_delete_elem(map, key);
1151                 goto out;
1152         } else if (IS_FD_PROG_ARRAY(map) ||
1153                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1154                 /* These maps require sleepable context */
1155                 err = map->ops->map_delete_elem(map, key);
1156                 goto out;
1157         }
1158
1159         bpf_disable_instrumentation();
1160         rcu_read_lock();
1161         err = map->ops->map_delete_elem(map, key);
1162         rcu_read_unlock();
1163         bpf_enable_instrumentation();
1164         maybe_wait_bpf_programs(map);
1165 out:
1166         kfree(key);
1167 err_put:
1168         fdput(f);
1169         return err;
1170 }
1171
1172 /* last field in 'union bpf_attr' used by this command */
1173 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1174
1175 static int map_get_next_key(union bpf_attr *attr)
1176 {
1177         void __user *ukey = u64_to_user_ptr(attr->key);
1178         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1179         int ufd = attr->map_fd;
1180         struct bpf_map *map;
1181         void *key, *next_key;
1182         struct fd f;
1183         int err;
1184
1185         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1186                 return -EINVAL;
1187
1188         f = fdget(ufd);
1189         map = __bpf_map_get(f);
1190         if (IS_ERR(map))
1191                 return PTR_ERR(map);
1192         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1193                 err = -EPERM;
1194                 goto err_put;
1195         }
1196
1197         if (ukey) {
1198                 key = __bpf_copy_key(ukey, map->key_size);
1199                 if (IS_ERR(key)) {
1200                         err = PTR_ERR(key);
1201                         goto err_put;
1202                 }
1203         } else {
1204                 key = NULL;
1205         }
1206
1207         err = -ENOMEM;
1208         next_key = kmalloc(map->key_size, GFP_USER);
1209         if (!next_key)
1210                 goto free_key;
1211
1212         if (bpf_map_is_dev_bound(map)) {
1213                 err = bpf_map_offload_get_next_key(map, key, next_key);
1214                 goto out;
1215         }
1216
1217         rcu_read_lock();
1218         err = map->ops->map_get_next_key(map, key, next_key);
1219         rcu_read_unlock();
1220 out:
1221         if (err)
1222                 goto free_next_key;
1223
1224         err = -EFAULT;
1225         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1226                 goto free_next_key;
1227
1228         err = 0;
1229
1230 free_next_key:
1231         kfree(next_key);
1232 free_key:
1233         kfree(key);
1234 err_put:
1235         fdput(f);
1236         return err;
1237 }
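/* Typical iteration from userspace (sketch): pass attr->key == 0 (NULL) to
 * fetch the first key, then feed each returned next_key back in as the new
 * key until the map's ->map_get_next_key() reports -ENOENT, which marks the
 * end of the iteration.
 */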
1238
1239 int generic_map_delete_batch(struct bpf_map *map,
1240                              const union bpf_attr *attr,
1241                              union bpf_attr __user *uattr)
1242 {
1243         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1244         u32 cp, max_count;
1245         int err = 0;
1246         void *key;
1247
1248         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1249                 return -EINVAL;
1250
1251         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1252             !map_value_has_spin_lock(map)) {
1253                 return -EINVAL;
1254         }
1255
1256         max_count = attr->batch.count;
1257         if (!max_count)
1258                 return 0;
1259
1260         key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1261         if (!key)
1262                 return -ENOMEM;
1263
1264         for (cp = 0; cp < max_count; cp++) {
1265                 err = -EFAULT;
1266                 if (copy_from_user(key, keys + cp * map->key_size,
1267                                    map->key_size))
1268                         break;
1269
1270                 if (bpf_map_is_dev_bound(map)) {
1271                         err = bpf_map_offload_delete_elem(map, key);
1272                         break;
1273                 }
1274
1275                 bpf_disable_instrumentation();
1276                 rcu_read_lock();
1277                 err = map->ops->map_delete_elem(map, key);
1278                 rcu_read_unlock();
1279                 bpf_enable_instrumentation();
1280                 maybe_wait_bpf_programs(map);
1281                 if (err)
1282                         break;
1283         }
1284         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1285                 err = -EFAULT;
1286
1287         kfree(key);
1288         return err;
1289 }
1290
1291 int generic_map_update_batch(struct bpf_map *map,
1292                              const union bpf_attr *attr,
1293                              union bpf_attr __user *uattr)
1294 {
1295         void __user *values = u64_to_user_ptr(attr->batch.values);
1296         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1297         u32 value_size, cp, max_count;
1298         int ufd = attr->map_fd;
1299         void *key, *value;
1300         struct fd f;
1301         int err = 0;
1302
1303         f = fdget(ufd);
1304         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1305                 return -EINVAL;
1306
1307         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1308             !map_value_has_spin_lock(map)) {
1309                 return -EINVAL;
1310         }
1311
1312         value_size = bpf_map_value_size(map);
1313
1314         max_count = attr->batch.count;
1315         if (!max_count)
1316                 return 0;
1317
1318         key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1319         if (!key)
1320                 return -ENOMEM;
1321
1322         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1323         if (!value) {
1324                 kfree(key);
1325                 return -ENOMEM;
1326         }
1327
1328         for (cp = 0; cp < max_count; cp++) {
1329                 err = -EFAULT;
1330                 if (copy_from_user(key, keys + cp * map->key_size,
1331                     map->key_size) ||
1332                     copy_from_user(value, values + cp * value_size, value_size))
1333                         break;
1334
1335                 err = bpf_map_update_value(map, f, key, value,
1336                                            attr->batch.elem_flags);
1337
1338                 if (err)
1339                         break;
1340         }
1341
1342         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1343                 err = -EFAULT;
1344
1345         kfree(value);
1346         kfree(key);
1347         return err;
1348 }
1349
1350 #define MAP_LOOKUP_RETRIES 3
1351
1352 int generic_map_lookup_batch(struct bpf_map *map,
1353                                     const union bpf_attr *attr,
1354                                     union bpf_attr __user *uattr)
1355 {
1356         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1357         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1358         void __user *values = u64_to_user_ptr(attr->batch.values);
1359         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1360         void *buf, *buf_prevkey, *prev_key, *key, *value;
1361         int err, retry = MAP_LOOKUP_RETRIES;
1362         u32 value_size, cp, max_count;
1363
1364         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1365                 return -EINVAL;
1366
1367         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1368             !map_value_has_spin_lock(map))
1369                 return -EINVAL;
1370
1371         value_size = bpf_map_value_size(map);
1372
1373         max_count = attr->batch.count;
1374         if (!max_count)
1375                 return 0;
1376
1377         if (put_user(0, &uattr->batch.count))
1378                 return -EFAULT;
1379
1380         buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1381         if (!buf_prevkey)
1382                 return -ENOMEM;
1383
1384         buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1385         if (!buf) {
1386                 kvfree(buf_prevkey);
1387                 return -ENOMEM;
1388         }
1389
1390         err = -EFAULT;
1391         prev_key = NULL;
1392         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1393                 goto free_buf;
1394         key = buf;
1395         value = key + map->key_size;
1396         if (ubatch)
1397                 prev_key = buf_prevkey;
1398
1399         for (cp = 0; cp < max_count;) {
1400                 rcu_read_lock();
1401                 err = map->ops->map_get_next_key(map, prev_key, key);
1402                 rcu_read_unlock();
1403                 if (err)
1404                         break;
1405                 err = bpf_map_copy_value(map, key, value,
1406                                          attr->batch.elem_flags);
1407
1408                 if (err == -ENOENT) {
1409                         if (retry) {
1410                                 retry--;
1411                                 continue;
1412                         }
1413                         err = -EINTR;
1414                         break;
1415                 }
1416
1417                 if (err)
1418                         goto free_buf;
1419
1420                 if (copy_to_user(keys + cp * map->key_size, key,
1421                                  map->key_size)) {
1422                         err = -EFAULT;
1423                         goto free_buf;
1424                 }
1425                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1426                         err = -EFAULT;
1427                         goto free_buf;
1428                 }
1429
1430                 if (!prev_key)
1431                         prev_key = buf_prevkey;
1432
1433                 swap(prev_key, key);
1434                 retry = MAP_LOOKUP_RETRIES;
1435                 cp++;
1436         }
1437
1438         if (err == -EFAULT)
1439                 goto free_buf;
1440
1441         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1442                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1443                 err = -EFAULT;
1444
1445 free_buf:
1446         kfree(buf_prevkey);
1447         kfree(buf);
1448         return err;
1449 }
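/* Note: when an element disappears between map_get_next_key() and
 * bpf_map_copy_value() (the lookup returns -ENOENT), the loop above retries
 * the same position up to MAP_LOOKUP_RETRIES times before giving up with
 * -EINTR, and uattr->batch.count always reflects how many elements were
 * actually copied out.
 */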
1450
1451 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
1452
1453 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1454 {
1455         void __user *ukey = u64_to_user_ptr(attr->key);
1456         void __user *uvalue = u64_to_user_ptr(attr->value);
1457         int ufd = attr->map_fd;
1458         struct bpf_map *map;
1459         void *key, *value;
1460         u32 value_size;
1461         struct fd f;
1462         int err;
1463
1464         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1465                 return -EINVAL;
1466
1467         f = fdget(ufd);
1468         map = __bpf_map_get(f);
1469         if (IS_ERR(map))
1470                 return PTR_ERR(map);
1471         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1472                 err = -EPERM;
1473                 goto err_put;
1474         }
1475
1476         key = __bpf_copy_key(ukey, map->key_size);
1477         if (IS_ERR(key)) {
1478                 err = PTR_ERR(key);
1479                 goto err_put;
1480         }
1481
1482         value_size = map->value_size;
1483
1484         err = -ENOMEM;
1485         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1486         if (!value)
1487                 goto free_key;
1488
1489         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1490             map->map_type == BPF_MAP_TYPE_STACK) {
1491                 err = map->ops->map_pop_elem(map, value);
1492         } else {
1493                 err = -ENOTSUPP;
1494         }
1495
1496         if (err)
1497                 goto free_value;
1498
1499         if (copy_to_user(uvalue, value, value_size) != 0) {
1500                 err = -EFAULT;
1501                 goto free_value;
1502         }
1503
1504         err = 0;
1505
1506 free_value:
1507         kfree(value);
1508 free_key:
1509         kfree(key);
1510 err_put:
1511         fdput(f);
1512         return err;
1513 }
1514
1515 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
1516
1517 static int map_freeze(const union bpf_attr *attr)
1518 {
1519         int err = 0, ufd = attr->map_fd;
1520         struct bpf_map *map;
1521         struct fd f;
1522
1523         if (CHECK_ATTR(BPF_MAP_FREEZE))
1524                 return -EINVAL;
1525
1526         f = fdget(ufd);
1527         map = __bpf_map_get(f);
1528         if (IS_ERR(map))
1529                 return PTR_ERR(map);
1530
1531         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1532                 fdput(f);
1533                 return -ENOTSUPP;
1534         }
1535
1536         mutex_lock(&map->freeze_mutex);
1537
1538         if (map->writecnt) {
1539                 err = -EBUSY;
1540                 goto err_put;
1541         }
1542         if (READ_ONCE(map->frozen)) {
1543                 err = -EBUSY;
1544                 goto err_put;
1545         }
1546         if (!capable(CAP_SYS_ADMIN)) {
1547                 err = -EPERM;
1548                 goto err_put;
1549         }
1550
1551         WRITE_ONCE(map->frozen, true);
1552 err_put:
1553         mutex_unlock(&map->freeze_mutex);
1554         fdput(f);
1555         return err;
1556 }
1557
1558 static const struct bpf_prog_ops * const bpf_prog_types[] = {
1559 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
1560         [_id] = & _name ## _prog_ops,
1561 #define BPF_MAP_TYPE(_id, _ops)
1562 #include <linux/bpf_types.h>
1563 #undef BPF_PROG_TYPE
1564 #undef BPF_MAP_TYPE
1565 };
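/* For example, the BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, ...)
 * entry in bpf_types.h expands to
 * "[BPF_PROG_TYPE_SOCKET_FILTER] = &sk_filter_prog_ops," here, while the
 * BPF_MAP_TYPE() entries expand to nothing, mirroring the map-ops table above.
 */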
1566
1567 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
1568 {
1569         const struct bpf_prog_ops *ops;
1570
1571         if (type >= ARRAY_SIZE(bpf_prog_types))
1572                 return -EINVAL;
1573         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
1574         ops = bpf_prog_types[type];
1575         if (!ops)
1576                 return -EINVAL;
1577
1578         if (!bpf_prog_is_dev_bound(prog->aux))
1579                 prog->aux->ops = ops;
1580         else
1581                 prog->aux->ops = &bpf_offload_prog_ops;
1582         prog->type = type;
1583         return 0;
1584 }
1585
1586 enum bpf_audit {
1587         BPF_AUDIT_LOAD,
1588         BPF_AUDIT_UNLOAD,
1589         BPF_AUDIT_MAX,
1590 };
1591
1592 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
1593         [BPF_AUDIT_LOAD]   = "LOAD",
1594         [BPF_AUDIT_UNLOAD] = "UNLOAD",
1595 };
1596
1597 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
1598 {
1599         struct audit_context *ctx = NULL;
1600         struct audit_buffer *ab;
1601
1602         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
1603                 return;
1604         if (audit_enabled == AUDIT_OFF)
1605                 return;
1606         if (op == BPF_AUDIT_LOAD)
1607                 ctx = audit_context();
1608         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
1609         if (unlikely(!ab))
1610                 return;
1611         audit_log_format(ab, "prog-id=%u op=%s",
1612                          prog->aux->id, bpf_audit_str[op]);
1613         audit_log_end(ab);
1614 }
1615
1616 int __bpf_prog_charge(struct user_struct *user, u32 pages)
1617 {
1618         unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1619         unsigned long user_bufs;
1620
1621         if (user) {
1622                 user_bufs = atomic_long_add_return(pages, &user->locked_vm);
1623                 if (user_bufs > memlock_limit) {
1624                         atomic_long_sub(pages, &user->locked_vm);
1625                         return -EPERM;
1626                 }
1627         }
1628
1629         return 0;
1630 }
1631
1632 void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
1633 {
1634         if (user)
1635                 atomic_long_sub(pages, &user->locked_vm);
1636 }
1637
1638 static int bpf_prog_charge_memlock(struct bpf_prog *prog)
1639 {
1640         struct user_struct *user = get_current_user();
1641         int ret;
1642
1643         ret = __bpf_prog_charge(user, prog->pages);
1644         if (ret) {
1645                 free_uid(user);
1646                 return ret;
1647         }
1648
1649         prog->aux->user = user;
1650         return 0;
1651 }
1652
1653 static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
1654 {
1655         struct user_struct *user = prog->aux->user;
1656
1657         __bpf_prog_uncharge(user, prog->pages);
1658         free_uid(user);
1659 }
1660
1661 static int bpf_prog_alloc_id(struct bpf_prog *prog)
1662 {
1663         int id;
1664
1665         idr_preload(GFP_KERNEL);
1666         spin_lock_bh(&prog_idr_lock);
1667         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
1668         if (id > 0)
1669                 prog->aux->id = id;
1670         spin_unlock_bh(&prog_idr_lock);
1671         idr_preload_end();
1672
1673         /* id is in [1, INT_MAX) */
1674         if (WARN_ON_ONCE(!id))
1675                 return -ENOSPC;
1676
1677         return id > 0 ? 0 : id;
1678 }
1679
1680 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
1681 {
1682         /* cBPF to eBPF migrations are currently not in the idr store.
1683          * Offloaded programs are removed from the store when their device
1684          * disappears - even if someone grabs an fd to them, they are unusable
1685          * and simply wait for the refcnt to drop so they can be freed.
1686          */
1687         if (!prog->aux->id)
1688                 return;
1689
1690         if (do_idr_lock)
1691                 spin_lock_bh(&prog_idr_lock);
1692         else
1693                 __acquire(&prog_idr_lock);
1694
1695         idr_remove(&prog_idr, prog->aux->id);
1696         prog->aux->id = 0;
1697
1698         if (do_idr_lock)
1699                 spin_unlock_bh(&prog_idr_lock);
1700         else
1701                 __release(&prog_idr_lock);
1702 }
1703
1704 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
1705 {
1706         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
1707
1708         kvfree(aux->func_info);
1709         kfree(aux->func_info_aux);
1710         bpf_prog_uncharge_memlock(aux->prog);
1711         security_bpf_prog_free(aux);
1712         bpf_prog_free(aux->prog);
1713 }
1714
1715 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
1716 {
1717         bpf_prog_kallsyms_del_all(prog);
1718         btf_put(prog->aux->btf);
1719         bpf_prog_free_linfo(prog);
1720
1721         if (deferred)
1722                 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
1723         else
1724                 __bpf_prog_put_rcu(&prog->aux->rcu);
1725 }
1726
1727 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
1728 {
1729         if (atomic64_dec_and_test(&prog->aux->refcnt)) {
1730                 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
1731                 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
1732                 /* bpf_prog_free_id() must be called first */
1733                 bpf_prog_free_id(prog, do_idr_lock);
1734                 __bpf_prog_put_noref(prog, true);
1735         }
1736 }
1737
1738 void bpf_prog_put(struct bpf_prog *prog)
1739 {
1740         __bpf_prog_put(prog, true);
1741 }
1742 EXPORT_SYMBOL_GPL(bpf_prog_put);
1743
1744 static int bpf_prog_release(struct inode *inode, struct file *filp)
1745 {
1746         struct bpf_prog *prog = filp->private_data;
1747
1748         bpf_prog_put(prog);
1749         return 0;
1750 }
1751
1752 static void bpf_prog_get_stats(const struct bpf_prog *prog,
1753                                struct bpf_prog_stats *stats)
1754 {
1755         u64 nsecs = 0, cnt = 0;
1756         int cpu;
1757
1758         for_each_possible_cpu(cpu) {
1759                 const struct bpf_prog_stats *st;
1760                 unsigned int start;
1761                 u64 tnsecs, tcnt;
1762
1763                 st = per_cpu_ptr(prog->aux->stats, cpu);
1764                 do {
1765                         start = u64_stats_fetch_begin_irq(&st->syncp);
1766                         tnsecs = st->nsecs;
1767                         tcnt = st->cnt;
1768                 } while (u64_stats_fetch_retry_irq(&st->syncp, start));
1769                 nsecs += tnsecs;
1770                 cnt += tcnt;
1771         }
1772         stats->nsecs = nsecs;
1773         stats->cnt = cnt;
1774 }
1775
1776 #ifdef CONFIG_PROC_FS
1777 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
1778 {
1779         const struct bpf_prog *prog = filp->private_data;
1780         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
1781         struct bpf_prog_stats stats;
1782
1783         bpf_prog_get_stats(prog, &stats);
1784         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
1785         seq_printf(m,
1786                    "prog_type:\t%u\n"
1787                    "prog_jited:\t%u\n"
1788                    "prog_tag:\t%s\n"
1789                    "memlock:\t%llu\n"
1790                    "prog_id:\t%u\n"
1791                    "run_time_ns:\t%llu\n"
1792                    "run_cnt:\t%llu\n",
1793                    prog->type,
1794                    prog->jited,
1795                    prog_tag,
1796                    prog->pages * 1ULL << PAGE_SHIFT,
1797                    prog->aux->id,
1798                    stats.nsecs,
1799                    stats.cnt);
1800 }
1801 #endif
1802
1803 const struct file_operations bpf_prog_fops = {
1804 #ifdef CONFIG_PROC_FS
1805         .show_fdinfo    = bpf_prog_show_fdinfo,
1806 #endif
1807         .release        = bpf_prog_release,
1808         .read           = bpf_dummy_read,
1809         .write          = bpf_dummy_write,
1810 };
1811
1812 int bpf_prog_new_fd(struct bpf_prog *prog)
1813 {
1814         int ret;
1815
1816         ret = security_bpf_prog(prog);
1817         if (ret < 0)
1818                 return ret;
1819
1820         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
1821                                 O_RDWR | O_CLOEXEC);
1822 }
1823
1824 static struct bpf_prog *____bpf_prog_get(struct fd f)
1825 {
1826         if (!f.file)
1827                 return ERR_PTR(-EBADF);
1828         if (f.file->f_op != &bpf_prog_fops) {
1829                 fdput(f);
1830                 return ERR_PTR(-EINVAL);
1831         }
1832
1833         return f.file->private_data;
1834 }
1835
1836 void bpf_prog_add(struct bpf_prog *prog, int i)
1837 {
1838         atomic64_add(i, &prog->aux->refcnt);
1839 }
1840 EXPORT_SYMBOL_GPL(bpf_prog_add);
1841
1842 void bpf_prog_sub(struct bpf_prog *prog, int i)
1843 {
1844         /* Only to be used for undoing a previous bpf_prog_add() in some
1845          * error path. We still know that another entity in our call
1846          * path holds a reference to the program, thus atomic64_sub() can
1847          * be safely used in such cases!
1848          */
1849         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
1850 }
1851 EXPORT_SYMBOL_GPL(bpf_prog_sub);
1852
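/* Illustrative only, not part of this file: the intended pairing of
 * bpf_prog_add()/bpf_prog_sub() in a driver that hands out one program
 * reference per hardware queue (setup_queue() is a hypothetical helper):
 *
 *	bpf_prog_add(prog, nr_queues);
 *	for (i = 0; i < nr_queues; i++) {
 *		err = setup_queue(dev, i, prog);
 *		if (err) {
 *			bpf_prog_sub(prog, nr_queues - i);
 *			return err;
 *		}
 *	}
 */
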
1853 void bpf_prog_inc(struct bpf_prog *prog)
1854 {
1855         atomic64_inc(&prog->aux->refcnt);
1856 }
1857 EXPORT_SYMBOL_GPL(bpf_prog_inc);
1858
1859 /* prog_idr_lock should have been held */
1860 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
1861 {
1862         int refold;
1863
1864         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
1865
1866         if (!refold)
1867                 return ERR_PTR(-ENOENT);
1868
1869         return prog;
1870 }
1871 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
1872
1873 bool bpf_prog_get_ok(struct bpf_prog *prog,
1874                             enum bpf_prog_type *attach_type, bool attach_drv)
1875 {
1876         /* not an attachment, just a refcount inc, always allow */
1877         if (!attach_type)
1878                 return true;
1879
1880         if (prog->type != *attach_type)
1881                 return false;
1882         if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
1883                 return false;
1884
1885         return true;
1886 }
1887
1888 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
1889                                        bool attach_drv)
1890 {
1891         struct fd f = fdget(ufd);
1892         struct bpf_prog *prog;
1893
1894         prog = ____bpf_prog_get(f);
1895         if (IS_ERR(prog))
1896                 return prog;
1897         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
1898                 prog = ERR_PTR(-EINVAL);
1899                 goto out;
1900         }
1901
1902         bpf_prog_inc(prog);
1903 out:
1904         fdput(f);
1905         return prog;
1906 }
1907
1908 struct bpf_prog *bpf_prog_get(u32 ufd)
1909 {
1910         return __bpf_prog_get(ufd, NULL, false);
1911 }
1912
1913 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
1914                                        bool attach_drv)
1915 {
1916         return __bpf_prog_get(ufd, &type, attach_drv);
1917 }
1918 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
1919
1920 /* Initially all BPF programs could be loaded without specifying
1921  * expected_attach_type. Later, specifying expected_attach_type at load time
1922  * became required for some of them so that the program can be validated
1923  * properly. Program types that may be loaded both with and without (for
1924  * backward compatibility) expected_attach_type should have the default attach
1925  * type assigned to expected_attach_type for the latter case, so that it can be
1926  * validated later at attach time.
1927  *
1928  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if the
1929  * prog type requires it but has some attach types that have to be backward
1930  * compatible.
1931  */
1932 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
1933 {
1934         switch (attr->prog_type) {
1935         case BPF_PROG_TYPE_CGROUP_SOCK:
1936                 /* Unfortunately a BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
1937                  * exist, so treat a zero value as "unspecified" here.
1938                  */
1939                 if (!attr->expected_attach_type)
1940                         attr->expected_attach_type =
1941                                 BPF_CGROUP_INET_SOCK_CREATE;
1942                 break;
1943         }
1944 }
1945
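/* Illustrative only, not part of this file: the fixup above means that an
 * older loader which sets up BPF_PROG_TYPE_CGROUP_SOCK but leaves
 * expected_attach_type at zero behaves as if it had passed:
 *
 *	attr.prog_type		  = BPF_PROG_TYPE_CGROUP_SOCK;
 *	attr.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE;
 *
 * which bpf_prog_load_check_attach() below then accepts as a valid pair.
 */
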
1946 static int
1947 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
1948                            enum bpf_attach_type expected_attach_type,
1949                            u32 btf_id, u32 prog_fd)
1950 {
1951         if (btf_id) {
1952                 if (btf_id > BTF_MAX_TYPE)
1953                         return -EINVAL;
1954
1955                 switch (prog_type) {
1956                 case BPF_PROG_TYPE_TRACING:
1957                 case BPF_PROG_TYPE_LSM:
1958                 case BPF_PROG_TYPE_STRUCT_OPS:
1959                 case BPF_PROG_TYPE_EXT:
1960                         break;
1961                 default:
1962                         return -EINVAL;
1963                 }
1964         }
1965
1966         if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
1967             prog_type != BPF_PROG_TYPE_EXT)
1968                 return -EINVAL;
1969
1970         switch (prog_type) {
1971         case BPF_PROG_TYPE_CGROUP_SOCK:
1972                 switch (expected_attach_type) {
1973                 case BPF_CGROUP_INET_SOCK_CREATE:
1974                 case BPF_CGROUP_INET4_POST_BIND:
1975                 case BPF_CGROUP_INET6_POST_BIND:
1976                         return 0;
1977                 default:
1978                         return -EINVAL;
1979                 }
1980         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1981                 switch (expected_attach_type) {
1982                 case BPF_CGROUP_INET4_BIND:
1983                 case BPF_CGROUP_INET6_BIND:
1984                 case BPF_CGROUP_INET4_CONNECT:
1985                 case BPF_CGROUP_INET6_CONNECT:
1986                 case BPF_CGROUP_UDP4_SENDMSG:
1987                 case BPF_CGROUP_UDP6_SENDMSG:
1988                 case BPF_CGROUP_UDP4_RECVMSG:
1989                 case BPF_CGROUP_UDP6_RECVMSG:
1990                         return 0;
1991                 default:
1992                         return -EINVAL;
1993                 }
1994         case BPF_PROG_TYPE_CGROUP_SKB:
1995                 switch (expected_attach_type) {
1996                 case BPF_CGROUP_INET_INGRESS:
1997                 case BPF_CGROUP_INET_EGRESS:
1998                         return 0;
1999                 default:
2000                         return -EINVAL;
2001                 }
2002         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2003                 switch (expected_attach_type) {
2004                 case BPF_CGROUP_SETSOCKOPT:
2005                 case BPF_CGROUP_GETSOCKOPT:
2006                         return 0;
2007                 default:
2008                         return -EINVAL;
2009                 }
2010         case BPF_PROG_TYPE_EXT:
2011                 if (expected_attach_type)
2012                         return -EINVAL;
2013                 /* fallthrough */
2014         default:
2015                 return 0;
2016         }
2017 }
2018
2019 /* last field in 'union bpf_attr' used by this command */
2020 #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
2021
2022 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
2023 {
2024         enum bpf_prog_type type = attr->prog_type;
2025         struct bpf_prog *prog;
2026         int err;
2027         char license[128];
2028         bool is_gpl;
2029
2030         if (CHECK_ATTR(BPF_PROG_LOAD))
2031                 return -EINVAL;
2032
2033         if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2034                                  BPF_F_ANY_ALIGNMENT |
2035                                  BPF_F_TEST_STATE_FREQ |
2036                                  BPF_F_TEST_RND_HI32))
2037                 return -EINVAL;
2038
2039         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2040             (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2041             !capable(CAP_SYS_ADMIN))
2042                 return -EPERM;
2043
2044         /* copy eBPF program license from user space */
2045         if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
2046                               sizeof(license) - 1) < 0)
2047                 return -EFAULT;
2048         license[sizeof(license) - 1] = 0;
2049
2050         /* eBPF programs must be GPL compatible to use GPL-ed functions */
2051         is_gpl = license_is_gpl_compatible(license);
2052
2053         if (attr->insn_cnt == 0 ||
2054             attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
2055                 return -E2BIG;
2056         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2057             type != BPF_PROG_TYPE_CGROUP_SKB &&
2058             !capable(CAP_SYS_ADMIN))
2059                 return -EPERM;
2060
2061         bpf_prog_load_fixup_attach_type(attr);
2062         if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2063                                        attr->attach_btf_id,
2064                                        attr->attach_prog_fd))
2065                 return -EINVAL;
2066
2067         /* plain bpf_prog allocation */
2068         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2069         if (!prog)
2070                 return -ENOMEM;
2071
2072         prog->expected_attach_type = attr->expected_attach_type;
2073         prog->aux->attach_btf_id = attr->attach_btf_id;
2074         if (attr->attach_prog_fd) {
2075                 struct bpf_prog *tgt_prog;
2076
2077                 tgt_prog = bpf_prog_get(attr->attach_prog_fd);
2078                 if (IS_ERR(tgt_prog)) {
2079                         err = PTR_ERR(tgt_prog);
2080                         goto free_prog_nouncharge;
2081                 }
2082                 prog->aux->linked_prog = tgt_prog;
2083         }
2084
2085         prog->aux->offload_requested = !!attr->prog_ifindex;
2086
2087         err = security_bpf_prog_alloc(prog->aux);
2088         if (err)
2089                 goto free_prog_nouncharge;
2090
2091         err = bpf_prog_charge_memlock(prog);
2092         if (err)
2093                 goto free_prog_sec;
2094
2095         prog->len = attr->insn_cnt;
2096
2097         err = -EFAULT;
2098         if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
2099                            bpf_prog_insn_size(prog)) != 0)
2100                 goto free_prog;
2101
2102         prog->orig_prog = NULL;
2103         prog->jited = 0;
2104
2105         atomic64_set(&prog->aux->refcnt, 1);
2106         prog->gpl_compatible = is_gpl ? 1 : 0;
2107
2108         if (bpf_prog_is_dev_bound(prog->aux)) {
2109                 err = bpf_prog_offload_init(prog, attr);
2110                 if (err)
2111                         goto free_prog;
2112         }
2113
2114         /* find program type: socket_filter vs tracing_filter */
2115         err = find_prog_type(type, prog);
2116         if (err < 0)
2117                 goto free_prog;
2118
2119         prog->aux->load_time = ktime_get_boottime_ns();
2120         err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2121                                sizeof(attr->prog_name));
2122         if (err < 0)
2123                 goto free_prog;
2124
2125         /* run eBPF verifier */
2126         err = bpf_check(&prog, attr, uattr);
2127         if (err < 0)
2128                 goto free_used_maps;
2129
2130         prog = bpf_prog_select_runtime(prog, &err);
2131         if (err < 0)
2132                 goto free_used_maps;
2133
2134         err = bpf_prog_alloc_id(prog);
2135         if (err)
2136                 goto free_used_maps;
2137
2138         /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2139          * effectively publicly exposed. However, retrieving via
2140          * bpf_prog_get_fd_by_id() will take another reference,
2141          * therefore it cannot be gone underneath us.
2142          *
2143          * Only for the time /after/ successful bpf_prog_new_fd()
2144          * and before returning to userspace, we might just hold
2145          * one reference and any parallel close on that fd could
2146          * rip everything out. Hence, below notifications must
2147          * happen before bpf_prog_new_fd().
2148          *
2149          * Also, any failure handling from this point onwards must
2150          * be using bpf_prog_put() given the program is exposed.
2151          */
2152         bpf_prog_kallsyms_add(prog);
2153         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2154         bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2155
2156         err = bpf_prog_new_fd(prog);
2157         if (err < 0)
2158                 bpf_prog_put(prog);
2159         return err;
2160
2161 free_used_maps:
2162         /* In case we have subprogs, we need to wait for a grace
2163          * period before we can tear down JIT memory since symbols
2164          * are already exposed under kallsyms.
2165          */
2166         __bpf_prog_put_noref(prog, prog->aux->func_cnt);
2167         return err;
2168 free_prog:
2169         bpf_prog_uncharge_memlock(prog);
2170 free_prog_sec:
2171         security_bpf_prog_free(prog->aux);
2172 free_prog_nouncharge:
2173         bpf_prog_free(prog);
2174         return err;
2175 }
2176
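/* Illustrative only, not part of this file: a minimal user-space
 * BPF_PROG_LOAD request needs little more than instructions, a license and a
 * program type, e.g. a two-insn "return 0" socket filter:
 *
 *	struct bpf_insn prog_insns[] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
 *		  .dst_reg = BPF_REG_0, .imm = 0 },		// r0 = 0
 *		{ .code = BPF_JMP | BPF_EXIT },			// exit
 *	};
 *	union bpf_attr attr = {
 *		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
 *		.insn_cnt  = 2,
 *		.insns	   = (__u64)(unsigned long)prog_insns,
 *		.license   = (__u64)(unsigned long)"GPL",
 *	};
 *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *
 * On success prog_fd holds the one reference taken above; close() on it ends
 * up in bpf_prog_release() and drops the program.
 */
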
2177 #define BPF_OBJ_LAST_FIELD file_flags
2178
2179 static int bpf_obj_pin(const union bpf_attr *attr)
2180 {
2181         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
2182                 return -EINVAL;
2183
2184         return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
2185 }
2186
2187 static int bpf_obj_get(const union bpf_attr *attr)
2188 {
2189         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2190             attr->file_flags & ~BPF_OBJ_FLAG_MASK)
2191                 return -EINVAL;
2192
2193         return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
2194                                 attr->file_flags);
2195 }
2196
2197 void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
2198                    struct bpf_prog *prog)
2199 {
2200         atomic64_set(&link->refcnt, 1);
2201         link->ops = ops;
2202         link->prog = prog;
2203 }
2204
2205 /* Clean up bpf_link and corresponding anon_inode file and FD. After the
2206  * anon_inode is created, the bpf_link can't simply be kfree()'d because of the
2207  * deferred anon_inode release() call. This helper marks the bpf_link as
2208  * defunct, releases the anon_inode file and puts the reserved FD.
2209  */
2210 void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
2211                       int link_fd)
2212 {
2213         link->prog = NULL;
2214         fput(link_file);
2215         put_unused_fd(link_fd);
2216 }
2217
2218 void bpf_link_inc(struct bpf_link *link)
2219 {
2220         atomic64_inc(&link->refcnt);
2221 }
2222
2223 /* bpf_link_free is guaranteed to be called from process context */
2224 static void bpf_link_free(struct bpf_link *link)
2225 {
2226         if (link->prog) {
2227                 /* detach BPF program, clean up used resources */
2228                 link->ops->release(link);
2229                 bpf_prog_put(link->prog);
2230         }
2231         /* free bpf_link and its containing memory */
2232         link->ops->dealloc(link);
2233 }
2234
2235 static void bpf_link_put_deferred(struct work_struct *work)
2236 {
2237         struct bpf_link *link = container_of(work, struct bpf_link, work);
2238
2239         bpf_link_free(link);
2240 }
2241
2242 /* bpf_link_put can be called from atomic context, but ensures that resources
2243  * are freed from process context
2244  */
2245 void bpf_link_put(struct bpf_link *link)
2246 {
2247         if (!atomic64_dec_and_test(&link->refcnt))
2248                 return;
2249
2250         if (in_atomic()) {
2251                 INIT_WORK(&link->work, bpf_link_put_deferred);
2252                 schedule_work(&link->work);
2253         } else {
2254                 bpf_link_free(link);
2255         }
2256 }
2257
2258 static int bpf_link_release(struct inode *inode, struct file *filp)
2259 {
2260         struct bpf_link *link = filp->private_data;
2261
2262         bpf_link_put(link);
2263         return 0;
2264 }
2265
2266 #ifdef CONFIG_PROC_FS
2267 static const struct bpf_link_ops bpf_raw_tp_lops;
2268 static const struct bpf_link_ops bpf_tracing_link_lops;
2269
2270 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2271 {
2272         const struct bpf_link *link = filp->private_data;
2273         const struct bpf_prog *prog = link->prog;
2274         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2275         const char *link_type;
2276
2277         if (link->ops == &bpf_raw_tp_lops)
2278                 link_type = "raw_tracepoint";
2279         else if (link->ops == &bpf_tracing_link_lops)
2280                 link_type = "tracing";
2281 #ifdef CONFIG_CGROUP_BPF
2282         else if (link->ops == &bpf_cgroup_link_lops)
2283                 link_type = "cgroup";
2284 #endif
2285         else
2286                 link_type = "unknown";
2287
2288         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2289         seq_printf(m,
2290                    "link_type:\t%s\n"
2291                    "prog_tag:\t%s\n"
2292                    "prog_id:\t%u\n",
2293                    link_type,
2294                    prog_tag,
2295                    prog->aux->id);
2296 }
2297 #endif
2298
2299 static const struct file_operations bpf_link_fops = {
2300 #ifdef CONFIG_PROC_FS
2301         .show_fdinfo    = bpf_link_show_fdinfo,
2302 #endif
2303         .release        = bpf_link_release,
2304         .read           = bpf_dummy_read,
2305         .write          = bpf_dummy_write,
2306 };
2307
2308 int bpf_link_new_fd(struct bpf_link *link)
2309 {
2310         return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
2311 }
2312
2313 /* Similar to bpf_link_new_fd, create an anon_inode for the given bpf_link,
2314  * but instead of immediately installing the fd in the fdtable, just reserve
2315  * it and return. The caller then needs to either install it with
2316  * fd_install(fd, file) or release it with put_unused_fd(fd).
2317  * This is useful when bpf_link attachment/detachment is a complicated and
2318  * expensive operation that should be delayed until both the fd reservation
2319  * and the anon_inode creation have succeeded.
2320  */
2321 struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
2322 {
2323         struct file *file;
2324         int fd;
2325
2326         fd = get_unused_fd_flags(O_CLOEXEC);
2327         if (fd < 0)
2328                 return ERR_PTR(fd);
2329
2330         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2331         if (IS_ERR(file)) {
2332                 put_unused_fd(fd);
2333                 return file;
2334         }
2335
2336         *reserved_fd = fd;
2337         return file;
2338 }
2339
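/* Illustrative only, not part of this file: the expected calling pattern for
 * the two-step FD reservation is
 *
 *	link_file = bpf_link_new_file(&link->link, &link_fd);
 *	err = <do the actual attach>;
 *	if (err)
 *		bpf_link_cleanup(&link->link, link_file, link_fd);
 *	else
 *		fd_install(link_fd, link_file);
 *
 * exactly as done by bpf_tracing_prog_attach() and bpf_raw_tracepoint_open()
 * below.
 */
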
2340 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
2341 {
2342         struct fd f = fdget(ufd);
2343         struct bpf_link *link;
2344
2345         if (!f.file)
2346                 return ERR_PTR(-EBADF);
2347         if (f.file->f_op != &bpf_link_fops) {
2348                 fdput(f);
2349                 return ERR_PTR(-EINVAL);
2350         }
2351
2352         link = f.file->private_data;
2353         bpf_link_inc(link);
2354         fdput(f);
2355
2356         return link;
2357 }
2358
2359 struct bpf_tracing_link {
2360         struct bpf_link link;
2361 };
2362
2363 static void bpf_tracing_link_release(struct bpf_link *link)
2364 {
2365         WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
2366 }
2367
2368 static void bpf_tracing_link_dealloc(struct bpf_link *link)
2369 {
2370         struct bpf_tracing_link *tr_link =
2371                 container_of(link, struct bpf_tracing_link, link);
2372
2373         kfree(tr_link);
2374 }
2375
2376 static const struct bpf_link_ops bpf_tracing_link_lops = {
2377         .release = bpf_tracing_link_release,
2378         .dealloc = bpf_tracing_link_dealloc,
2379 };
2380
2381 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
2382 {
2383         struct bpf_tracing_link *link;
2384         struct file *link_file;
2385         int link_fd, err;
2386
2387         switch (prog->type) {
2388         case BPF_PROG_TYPE_TRACING:
2389                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
2390                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
2391                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
2392                         err = -EINVAL;
2393                         goto out_put_prog;
2394                 }
2395                 break;
2396         case BPF_PROG_TYPE_EXT:
2397                 if (prog->expected_attach_type != 0) {
2398                         err = -EINVAL;
2399                         goto out_put_prog;
2400                 }
2401                 break;
2402         case BPF_PROG_TYPE_LSM:
2403                 if (prog->expected_attach_type != BPF_LSM_MAC) {
2404                         err = -EINVAL;
2405                         goto out_put_prog;
2406                 }
2407                 break;
2408         default:
2409                 err = -EINVAL;
2410                 goto out_put_prog;
2411         }
2412
2413         link = kzalloc(sizeof(*link), GFP_USER);
2414         if (!link) {
2415                 err = -ENOMEM;
2416                 goto out_put_prog;
2417         }
2418         bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
2419
2420         link_file = bpf_link_new_file(&link->link, &link_fd);
2421         if (IS_ERR(link_file)) {
2422                 kfree(link);
2423                 err = PTR_ERR(link_file);
2424                 goto out_put_prog;
2425         }
2426
2427         err = bpf_trampoline_link_prog(prog);
2428         if (err) {
2429                 bpf_link_cleanup(&link->link, link_file, link_fd);
2430                 goto out_put_prog;
2431         }
2432
2433         fd_install(link_fd, link_file);
2434         return link_fd;
2435
2436 out_put_prog:
2437         bpf_prog_put(prog);
2438         return err;
2439 }
2440
2441 struct bpf_raw_tp_link {
2442         struct bpf_link link;
2443         struct bpf_raw_event_map *btp;
2444 };
2445
2446 static void bpf_raw_tp_link_release(struct bpf_link *link)
2447 {
2448         struct bpf_raw_tp_link *raw_tp =
2449                 container_of(link, struct bpf_raw_tp_link, link);
2450
2451         bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
2452         bpf_put_raw_tracepoint(raw_tp->btp);
2453 }
2454
2455 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
2456 {
2457         struct bpf_raw_tp_link *raw_tp =
2458                 container_of(link, struct bpf_raw_tp_link, link);
2459
2460         kfree(raw_tp);
2461 }
2462
2463 static const struct bpf_link_ops bpf_raw_tp_lops = {
2464         .release = bpf_raw_tp_link_release,
2465         .dealloc = bpf_raw_tp_link_dealloc,
2466 };
2467
2468 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
2469
2470 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
2471 {
2472         struct bpf_raw_tp_link *link;
2473         struct bpf_raw_event_map *btp;
2474         struct file *link_file;
2475         struct bpf_prog *prog;
2476         const char *tp_name;
2477         char buf[128];
2478         int link_fd, err;
2479
2480         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
2481                 return -EINVAL;
2482
2483         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
2484         if (IS_ERR(prog))
2485                 return PTR_ERR(prog);
2486
2487         switch (prog->type) {
2488         case BPF_PROG_TYPE_TRACING:
2489         case BPF_PROG_TYPE_EXT:
2490         case BPF_PROG_TYPE_LSM:
2491                 if (attr->raw_tracepoint.name) {
2492                         /* The attach point for this category of programs
2493                          * should be specified via btf_id during program load.
2494                          */
2495                         err = -EINVAL;
2496                         goto out_put_prog;
2497                 }
2498                 if (prog->type == BPF_PROG_TYPE_TRACING &&
2499                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
2500                         tp_name = prog->aux->attach_func_name;
2501                         break;
2502                 }
2503                 return bpf_tracing_prog_attach(prog);
2504         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2505         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2506                 if (strncpy_from_user(buf,
2507                                       u64_to_user_ptr(attr->raw_tracepoint.name),
2508                                       sizeof(buf) - 1) < 0) {
2509                         err = -EFAULT;
2510                         goto out_put_prog;
2511                 }
2512                 buf[sizeof(buf) - 1] = 0;
2513                 tp_name = buf;
2514                 break;
2515         default:
2516                 err = -EINVAL;
2517                 goto out_put_prog;
2518         }
2519
2520         btp = bpf_get_raw_tracepoint(tp_name);
2521         if (!btp) {
2522                 err = -ENOENT;
2523                 goto out_put_prog;
2524         }
2525
2526         link = kzalloc(sizeof(*link), GFP_USER);
2527         if (!link) {
2528                 err = -ENOMEM;
2529                 goto out_put_btp;
2530         }
2531         bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
2532         link->btp = btp;
2533
2534         link_file = bpf_link_new_file(&link->link, &link_fd);
2535         if (IS_ERR(link_file)) {
2536                 kfree(link);
2537                 err = PTR_ERR(link_file);
2538                 goto out_put_btp;
2539         }
2540
2541         err = bpf_probe_register(link->btp, prog);
2542         if (err) {
2543                 bpf_link_cleanup(&link->link, link_file, link_fd);
2544                 goto out_put_btp;
2545         }
2546
2547         fd_install(link_fd, link_file);
2548         return link_fd;
2549
2550 out_put_btp:
2551         bpf_put_raw_tracepoint(btp);
2552 out_put_prog:
2553         bpf_prog_put(prog);
2554         return err;
2555 }
2556
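/* Illustrative only, not part of this file: from user space, attaching a
 * BPF_PROG_TYPE_RAW_TRACEPOINT program (prog_fd is a hypothetical fd from a
 * prior BPF_PROG_LOAD) looks like:
 *
 *	union bpf_attr attr = {
 *		.raw_tracepoint = {
 *			.name	 = (__u64)(unsigned long)"sched_switch",
 *			.prog_fd = prog_fd,
 *		},
 *	};
 *	int link_fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN,
 *			      &attr, sizeof(attr));
 *
 * For BPF_PROG_TYPE_TRACING/EXT/LSM programs, name must be left zero since
 * the target is already fixed by attach_btf_id at load time.
 */
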
2557 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
2558                                              enum bpf_attach_type attach_type)
2559 {
2560         switch (prog->type) {
2561         case BPF_PROG_TYPE_CGROUP_SOCK:
2562         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2563         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2564                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
2565         case BPF_PROG_TYPE_CGROUP_SKB:
2566                 return prog->enforce_expected_attach_type &&
2567                         prog->expected_attach_type != attach_type ?
2568                         -EINVAL : 0;
2569         default:
2570                 return 0;
2571         }
2572 }
2573
2574 static enum bpf_prog_type
2575 attach_type_to_prog_type(enum bpf_attach_type attach_type)
2576 {
2577         switch (attach_type) {
2578         case BPF_CGROUP_INET_INGRESS:
2579         case BPF_CGROUP_INET_EGRESS:
2580                 return BPF_PROG_TYPE_CGROUP_SKB;
2581                 break;
2583         case BPF_CGROUP_INET4_POST_BIND:
2584         case BPF_CGROUP_INET6_POST_BIND:
2585                 return BPF_PROG_TYPE_CGROUP_SOCK;
2586         case BPF_CGROUP_INET4_BIND:
2587         case BPF_CGROUP_INET6_BIND:
2588         case BPF_CGROUP_INET4_CONNECT:
2589         case BPF_CGROUP_INET6_CONNECT:
2590         case BPF_CGROUP_UDP4_SENDMSG:
2591         case BPF_CGROUP_UDP6_SENDMSG:
2592         case BPF_CGROUP_UDP4_RECVMSG:
2593         case BPF_CGROUP_UDP6_RECVMSG:
2594                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
2595         case BPF_CGROUP_SOCK_OPS:
2596                 return BPF_PROG_TYPE_SOCK_OPS;
2597         case BPF_CGROUP_DEVICE:
2598                 return BPF_PROG_TYPE_CGROUP_DEVICE;
2599         case BPF_SK_MSG_VERDICT:
2600                 return BPF_PROG_TYPE_SK_MSG;
2601         case BPF_SK_SKB_STREAM_PARSER:
2602         case BPF_SK_SKB_STREAM_VERDICT:
2603                 return BPF_PROG_TYPE_SK_SKB;
2604         case BPF_LIRC_MODE2:
2605                 return BPF_PROG_TYPE_LIRC_MODE2;
2606         case BPF_FLOW_DISSECTOR:
2607                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
2608         case BPF_CGROUP_SYSCTL:
2609                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
2610         case BPF_CGROUP_GETSOCKOPT:
2611         case BPF_CGROUP_SETSOCKOPT:
2612                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
2613         default:
2614                 return BPF_PROG_TYPE_UNSPEC;
2615         }
2616 }
2617
2618 #define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
2619
2620 #define BPF_F_ATTACH_MASK \
2621         (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
2622
2623 static int bpf_prog_attach(const union bpf_attr *attr)
2624 {
2625         enum bpf_prog_type ptype;
2626         struct bpf_prog *prog;
2627         int ret;
2628
2629         if (!capable(CAP_NET_ADMIN))
2630                 return -EPERM;
2631
2632         if (CHECK_ATTR(BPF_PROG_ATTACH))
2633                 return -EINVAL;
2634
2635         if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
2636                 return -EINVAL;
2637
2638         ptype = attach_type_to_prog_type(attr->attach_type);
2639         if (ptype == BPF_PROG_TYPE_UNSPEC)
2640                 return -EINVAL;
2641
2642         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
2643         if (IS_ERR(prog))
2644                 return PTR_ERR(prog);
2645
2646         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
2647                 bpf_prog_put(prog);
2648                 return -EINVAL;
2649         }
2650
2651         switch (ptype) {
2652         case BPF_PROG_TYPE_SK_SKB:
2653         case BPF_PROG_TYPE_SK_MSG:
2654                 ret = sock_map_get_from_fd(attr, prog);
2655                 break;
2656         case BPF_PROG_TYPE_LIRC_MODE2:
2657                 ret = lirc_prog_attach(attr, prog);
2658                 break;
2659         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2660                 ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
2661                 break;
2662         case BPF_PROG_TYPE_CGROUP_DEVICE:
2663         case BPF_PROG_TYPE_CGROUP_SKB:
2664         case BPF_PROG_TYPE_CGROUP_SOCK:
2665         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2666         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2667         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2668         case BPF_PROG_TYPE_SOCK_OPS:
2669                 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
2670                 break;
2671         default:
2672                 ret = -EINVAL;
2673         }
2674
2675         if (ret)
2676                 bpf_prog_put(prog);
2677         return ret;
2678 }
2679
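/* Illustrative only, not part of this file: a user-space BPF_PROG_ATTACH for
 * a cgroup program, with cgroup_fd being a hypothetical fd of the target
 * cgroup directory and prog_fd a loaded BPF_PROG_TYPE_CGROUP_SKB program:
 *
 *	union bpf_attr attr = {
 *		.target_fd	= cgroup_fd,
 *		.attach_bpf_fd	= prog_fd,
 *		.attach_type	= BPF_CGROUP_INET_INGRESS,
 *		.attach_flags	= BPF_F_ALLOW_MULTI,
 *	};
 *	err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 *
 * attach_type_to_prog_type() above maps BPF_CGROUP_INET_INGRESS back to
 * BPF_PROG_TYPE_CGROUP_SKB, so a mismatched program fd is rejected.
 */
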
2680 #define BPF_PROG_DETACH_LAST_FIELD attach_type
2681
2682 static int bpf_prog_detach(const union bpf_attr *attr)
2683 {
2684         enum bpf_prog_type ptype;
2685
2686         if (!capable(CAP_NET_ADMIN))
2687                 return -EPERM;
2688
2689         if (CHECK_ATTR(BPF_PROG_DETACH))
2690                 return -EINVAL;
2691
2692         ptype = attach_type_to_prog_type(attr->attach_type);
2693
2694         switch (ptype) {
2695         case BPF_PROG_TYPE_SK_MSG:
2696         case BPF_PROG_TYPE_SK_SKB:
2697                 return sock_map_get_from_fd(attr, NULL);
2698         case BPF_PROG_TYPE_LIRC_MODE2:
2699                 return lirc_prog_detach(attr);
2700         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2701                 return skb_flow_dissector_bpf_prog_detach(attr);
2702         case BPF_PROG_TYPE_CGROUP_DEVICE:
2703         case BPF_PROG_TYPE_CGROUP_SKB:
2704         case BPF_PROG_TYPE_CGROUP_SOCK:
2705         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2706         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2707         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2708         case BPF_PROG_TYPE_SOCK_OPS:
2709                 return cgroup_bpf_prog_detach(attr, ptype);
2710         default:
2711                 return -EINVAL;
2712         }
2713 }
2714
2715 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
2716
2717 static int bpf_prog_query(const union bpf_attr *attr,
2718                           union bpf_attr __user *uattr)
2719 {
2720         if (!capable(CAP_NET_ADMIN))
2721                 return -EPERM;
2722         if (CHECK_ATTR(BPF_PROG_QUERY))
2723                 return -EINVAL;
2724         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
2725                 return -EINVAL;
2726
2727         switch (attr->query.attach_type) {
2728         case BPF_CGROUP_INET_INGRESS:
2729         case BPF_CGROUP_INET_EGRESS:
2730         case BPF_CGROUP_INET_SOCK_CREATE:
2731         case BPF_CGROUP_INET4_BIND:
2732         case BPF_CGROUP_INET6_BIND:
2733         case BPF_CGROUP_INET4_POST_BIND:
2734         case BPF_CGROUP_INET6_POST_BIND:
2735         case BPF_CGROUP_INET4_CONNECT:
2736         case BPF_CGROUP_INET6_CONNECT:
2737         case BPF_CGROUP_UDP4_SENDMSG:
2738         case BPF_CGROUP_UDP6_SENDMSG:
2739         case BPF_CGROUP_UDP4_RECVMSG:
2740         case BPF_CGROUP_UDP6_RECVMSG:
2741         case BPF_CGROUP_SOCK_OPS:
2742         case BPF_CGROUP_DEVICE:
2743         case BPF_CGROUP_SYSCTL:
2744         case BPF_CGROUP_GETSOCKOPT:
2745         case BPF_CGROUP_SETSOCKOPT:
2746                 return cgroup_bpf_prog_query(attr, uattr);
2747         case BPF_LIRC_MODE2:
2748                 return lirc_prog_query(attr, uattr);
2749         case BPF_FLOW_DISSECTOR:
2750                 return skb_flow_dissector_prog_query(attr, uattr);
2751         default:
2752                 return -EINVAL;
2753         }
2754 }
2755
2756 #define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
2757
2758 static int bpf_prog_test_run(const union bpf_attr *attr,
2759                              union bpf_attr __user *uattr)
2760 {
2761         struct bpf_prog *prog;
2762         int ret = -ENOTSUPP;
2763
2764         if (!capable(CAP_SYS_ADMIN))
2765                 return -EPERM;
2766         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
2767                 return -EINVAL;
2768
2769         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
2770             (!attr->test.ctx_size_in && attr->test.ctx_in))
2771                 return -EINVAL;
2772
2773         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
2774             (!attr->test.ctx_size_out && attr->test.ctx_out))
2775                 return -EINVAL;
2776
2777         prog = bpf_prog_get(attr->test.prog_fd);
2778         if (IS_ERR(prog))
2779                 return PTR_ERR(prog);
2780
2781         if (prog->aux->ops->test_run)
2782                 ret = prog->aux->ops->test_run(prog, attr, uattr);
2783
2784         bpf_prog_put(prog);
2785         return ret;
2786 }
2787
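/* Illustrative only, not part of this file: exercising a program through
 * BPF_PROG_TEST_RUN, where prog_fd and pkt[] (a buffer holding a test
 * packet) are hypothetical:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.test.prog_fd	= prog_fd;
 *	attr.test.data_in	= (__u64)(unsigned long)pkt;
 *	attr.test.data_size_in	= sizeof(pkt);
 *	attr.test.repeat	= 1000;
 *	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
 *
 * On success, attr.test.retval and attr.test.duration carry the program's
 * return value and the per-run duration in nanoseconds.
 */
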
2788 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
2789
2790 static int bpf_obj_get_next_id(const union bpf_attr *attr,
2791                                union bpf_attr __user *uattr,
2792                                struct idr *idr,
2793                                spinlock_t *lock)
2794 {
2795         u32 next_id = attr->start_id;
2796         int err = 0;
2797
2798         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
2799                 return -EINVAL;
2800
2801         if (!capable(CAP_SYS_ADMIN))
2802                 return -EPERM;
2803
2804         next_id++;
2805         spin_lock_bh(lock);
2806         if (!idr_get_next(idr, &next_id))
2807                 err = -ENOENT;
2808         spin_unlock_bh(lock);
2809
2810         if (!err)
2811                 err = put_user(next_id, &uattr->next_id);
2812
2813         return err;
2814 }
2815
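/* Illustrative only, not part of this file: user space typically walks all
 * program IDs by chaining BPF_PROG_GET_NEXT_ID calls:
 *
 *	__u32 id = 0;
 *	union bpf_attr attr = {};
 *
 *	for (;;) {
 *		attr.start_id = id;
 *		if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
 *			break;	// -ENOENT once the last ID has been returned
 *		id = attr.next_id;
 *		// e.g. follow up with BPF_PROG_GET_FD_BY_ID on id
 *	}
 *
 * The same helper also backs BPF_MAP_GET_NEXT_ID via the map_idr.
 */
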
2816 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
2817
2818 struct bpf_prog *bpf_prog_by_id(u32 id)
2819 {
2820         struct bpf_prog *prog;
2821
2822         if (!id)
2823                 return ERR_PTR(-ENOENT);
2824
2825         spin_lock_bh(&prog_idr_lock);
2826         prog = idr_find(&prog_idr, id);
2827         if (prog)
2828                 prog = bpf_prog_inc_not_zero(prog);
2829         else
2830                 prog = ERR_PTR(-ENOENT);
2831         spin_unlock_bh(&prog_idr_lock);
2832         return prog;
2833 }
2834
2835 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
2836 {
2837         struct bpf_prog *prog;
2838         u32 id = attr->prog_id;
2839         int fd;
2840
2841         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
2842                 return -EINVAL;
2843
2844         if (!capable(CAP_SYS_ADMIN))
2845                 return -EPERM;
2846
2847         prog = bpf_prog_by_id(id);
2848         if (IS_ERR(prog))
2849                 return PTR_ERR(prog);
2850
2851         fd = bpf_prog_new_fd(prog);
2852         if (fd < 0)
2853                 bpf_prog_put(prog);
2854
2855         return fd;
2856 }
2857
2858 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
2859
2860 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
2861 {
2862         struct bpf_map *map;
2863         u32 id = attr->map_id;
2864         int f_flags;
2865         int fd;
2866
2867         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
2868             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
2869                 return -EINVAL;
2870
2871         if (!capable(CAP_SYS_ADMIN))
2872                 return -EPERM;
2873
2874         f_flags = bpf_get_file_flag(attr->open_flags);
2875         if (f_flags < 0)
2876                 return f_flags;
2877
2878         spin_lock_bh(&map_idr_lock);
2879         map = idr_find(&map_idr, id);
2880         if (map)
2881                 map = __bpf_map_inc_not_zero(map, true);
2882         else
2883                 map = ERR_PTR(-ENOENT);
2884         spin_unlock_bh(&map_idr_lock);
2885
2886         if (IS_ERR(map))
2887                 return PTR_ERR(map);
2888
2889         fd = bpf_map_new_fd(map, f_flags);
2890         if (fd < 0)
2891                 bpf_map_put_with_uref(map);
2892
2893         return fd;
2894 }
2895
2896 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
2897                                               unsigned long addr, u32 *off,
2898                                               u32 *type)
2899 {
2900         const struct bpf_map *map;
2901         int i;
2902
2903         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
2904                 map = prog->aux->used_maps[i];
2905                 if (map == (void *)addr) {
2906                         *type = BPF_PSEUDO_MAP_FD;
2907                         return map;
2908                 }
2909                 if (!map->ops->map_direct_value_meta)
2910                         continue;
2911                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
2912                         *type = BPF_PSEUDO_MAP_VALUE;
2913                         return map;
2914                 }
2915         }
2916
2917         return NULL;
2918 }
2919
2920 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
2921 {
2922         const struct bpf_map *map;
2923         struct bpf_insn *insns;
2924         u32 off, type;
2925         u64 imm;
2926         int i;
2927
2928         insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
2929                         GFP_USER);
2930         if (!insns)
2931                 return insns;
2932
2933         for (i = 0; i < prog->len; i++) {
2934                 if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
2935                         insns[i].code = BPF_JMP | BPF_CALL;
2936                         insns[i].imm = BPF_FUNC_tail_call;
2937                         /* fall-through */
2938                 }
2939                 if (insns[i].code == (BPF_JMP | BPF_CALL) ||
2940                     insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
2941                         if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
2942                                 insns[i].code = BPF_JMP | BPF_CALL;
2943                         if (!bpf_dump_raw_ok())
2944                                 insns[i].imm = 0;
2945                         continue;
2946                 }
2947
2948                 if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
2949                         continue;
2950
2951                 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
2952                 map = bpf_map_from_imm(prog, imm, &off, &type);
2953                 if (map) {
2954                         insns[i].src_reg = type;
2955                         insns[i].imm = map->id;
2956                         insns[i + 1].imm = off;
2957                         continue;
2958                 }
2959         }
2960
2961         return insns;
2962 }
2963
2964 static int set_info_rec_size(struct bpf_prog_info *info)
2965 {
2966         /*
2967          * Ensure info.*_rec_size is the same as the kernel's expected size,
2968          *
2969          * or
2970          *
2971          * only allow a zero *_rec_size if the matching _cnt is zero as
2972          * well.  In the latter case, the kernel will write the expected
2973          * _rec_size back into the info.
2974          */
2975
2976         if ((info->nr_func_info || info->func_info_rec_size) &&
2977             info->func_info_rec_size != sizeof(struct bpf_func_info))
2978                 return -EINVAL;
2979
2980         if ((info->nr_line_info || info->line_info_rec_size) &&
2981             info->line_info_rec_size != sizeof(struct bpf_line_info))
2982                 return -EINVAL;
2983
2984         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
2985             info->jited_line_info_rec_size != sizeof(__u64))
2986                 return -EINVAL;
2987
2988         info->func_info_rec_size = sizeof(struct bpf_func_info);
2989         info->line_info_rec_size = sizeof(struct bpf_line_info);
2990         info->jited_line_info_rec_size = sizeof(__u64);
2991
2992         return 0;
2993 }
2994
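/* Illustrative only, not part of this file: a user-space caller querying
 * func_info through BPF_OBJ_GET_INFO_BY_FD must either pass the exact record
 * size or leave both fields zero (buf/nr are hypothetical):
 *
 *	struct bpf_prog_info info = {};
 *
 *	info.nr_func_info	= nr;		// capacity, in records
 *	info.func_info_rec_size	= sizeof(struct bpf_func_info);
 *	info.func_info		= (__u64)(unsigned long)buf;
 *
 * Passing func_info_rec_size == 0 together with a non-zero nr_func_info is
 * rejected with -EINVAL by the check above; with both zero, the kernel
 * writes the expected record size back into info.
 */
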
2995 static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
2996                                    const union bpf_attr *attr,
2997                                    union bpf_attr __user *uattr)
2998 {
2999         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3000         struct bpf_prog_info info;
3001         u32 info_len = attr->info.info_len;
3002         struct bpf_prog_stats stats;
3003         char __user *uinsns;
3004         u32 ulen;
3005         int err;
3006
3007         err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
3008         if (err)
3009                 return err;
3010         info_len = min_t(u32, sizeof(info), info_len);
3011
3012         memset(&info, 0, sizeof(info));
3013         if (copy_from_user(&info, uinfo, info_len))
3014                 return -EFAULT;
3015
3016         info.type = prog->type;
3017         info.id = prog->aux->id;
3018         info.load_time = prog->aux->load_time;
3019         info.created_by_uid = from_kuid_munged(current_user_ns(),
3020                                                prog->aux->user->uid);
3021         info.gpl_compatible = prog->gpl_compatible;
3022
3023         memcpy(info.tag, prog->tag, sizeof(prog->tag));
3024         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
3025
3026         ulen = info.nr_map_ids;
3027         info.nr_map_ids = prog->aux->used_map_cnt;
3028         ulen = min_t(u32, info.nr_map_ids, ulen);
3029         if (ulen) {
3030                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
3031                 u32 i;
3032
3033                 for (i = 0; i < ulen; i++)
3034                         if (put_user(prog->aux->used_maps[i]->id,
3035                                      &user_map_ids[i]))
3036                                 return -EFAULT;
3037         }
3038
3039         err = set_info_rec_size(&info);
3040         if (err)
3041                 return err;
3042
3043         bpf_prog_get_stats(prog, &stats);
3044         info.run_time_ns = stats.nsecs;
3045         info.run_cnt = stats.cnt;
3046
3047         if (!capable(CAP_SYS_ADMIN)) {
3048                 info.jited_prog_len = 0;
3049                 info.xlated_prog_len = 0;
3050                 info.nr_jited_ksyms = 0;
3051                 info.nr_jited_func_lens = 0;
3052                 info.nr_func_info = 0;
3053                 info.nr_line_info = 0;
3054                 info.nr_jited_line_info = 0;
3055                 goto done;
3056         }
3057
3058         ulen = info.xlated_prog_len;
3059         info.xlated_prog_len = bpf_prog_insn_size(prog);
3060         if (info.xlated_prog_len && ulen) {
3061                 struct bpf_insn *insns_sanitized;
3062                 bool fault;
3063
3064                 if (prog->blinded && !bpf_dump_raw_ok()) {
3065                         info.xlated_prog_insns = 0;
3066                         goto done;
3067                 }
3068                 insns_sanitized = bpf_insn_prepare_dump(prog);
3069                 if (!insns_sanitized)
3070                         return -ENOMEM;
3071                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
3072                 ulen = min_t(u32, info.xlated_prog_len, ulen);
3073                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
3074                 kfree(insns_sanitized);
3075                 if (fault)
3076                         return -EFAULT;
3077         }
3078
3079         if (bpf_prog_is_dev_bound(prog->aux)) {
3080                 err = bpf_prog_offload_info_fill(&info, prog);
3081                 if (err)
3082                         return err;
3083                 goto done;
3084         }
3085
3086         /* NOTE: the following code is supposed to be skipped for offload.
3087          * bpf_prog_offload_info_fill() is the place to fill similar fields
3088          * for offload.
3089          */
3090         ulen = info.jited_prog_len;
3091         if (prog->aux->func_cnt) {
3092                 u32 i;
3093
3094                 info.jited_prog_len = 0;
3095                 for (i = 0; i < prog->aux->func_cnt; i++)
3096                         info.jited_prog_len += prog->aux->func[i]->jited_len;
3097         } else {
3098                 info.jited_prog_len = prog->jited_len;
3099         }
3100
3101         if (info.jited_prog_len && ulen) {
3102                 if (bpf_dump_raw_ok()) {
3103                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
3104                         ulen = min_t(u32, info.jited_prog_len, ulen);
3105
3106                         /* for multi-function programs, copy the JITed
3107                          * instructions for all the functions
3108                          */
3109                         if (prog->aux->func_cnt) {
3110                                 u32 len, free, i;
3111                                 u8 *img;
3112
3113                                 free = ulen;
3114                                 for (i = 0; i < prog->aux->func_cnt; i++) {
3115                                         len = prog->aux->func[i]->jited_len;
3116                                         len = min_t(u32, len, free);
3117                                         img = (u8 *) prog->aux->func[i]->bpf_func;
3118                                         if (copy_to_user(uinsns, img, len))
3119                                                 return -EFAULT;
3120                                         uinsns += len;
3121                                         free -= len;
3122                                         if (!free)
3123                                                 break;
3124                                 }
3125                         } else {
3126                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
3127                                         return -EFAULT;
3128                         }
3129                 } else {
3130                         info.jited_prog_insns = 0;
3131                 }
3132         }
3133
3134         ulen = info.nr_jited_ksyms;
3135         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
3136         if (ulen) {
3137                 if (bpf_dump_raw_ok()) {
3138                         unsigned long ksym_addr;
3139                         u64 __user *user_ksyms;
3140                         u32 i;
3141
3142                         /* copy the address of the kernel symbol
3143                          * corresponding to each function
3144                          */
3145                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
3146                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
3147                         if (prog->aux->func_cnt) {
3148                                 for (i = 0; i < ulen; i++) {
3149                                         ksym_addr = (unsigned long)
3150                                                 prog->aux->func[i]->bpf_func;
3151                                         if (put_user((u64) ksym_addr,
3152                                                      &user_ksyms[i]))
3153                                                 return -EFAULT;
3154                                 }
3155                         } else {
3156                                 ksym_addr = (unsigned long) prog->bpf_func;
3157                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
3158                                         return -EFAULT;
3159                         }
3160                 } else {
3161                         info.jited_ksyms = 0;
3162                 }
3163         }
3164
3165         ulen = info.nr_jited_func_lens;
3166         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
3167         if (ulen) {
3168                 if (bpf_dump_raw_ok()) {
3169                         u32 __user *user_lens;
3170                         u32 func_len, i;
3171
3172                         /* copy the JITed image lengths for each function */
3173                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
3174                         user_lens = u64_to_user_ptr(info.jited_func_lens);
3175                         if (prog->aux->func_cnt) {
3176                                 for (i = 0; i < ulen; i++) {
3177                                         func_len =
3178                                                 prog->aux->func[i]->jited_len;
3179                                         if (put_user(func_len, &user_lens[i]))
3180                                                 return -EFAULT;
3181                                 }
3182                         } else {
3183                                 func_len = prog->jited_len;
3184                                 if (put_user(func_len, &user_lens[0]))
3185                                         return -EFAULT;
3186                         }
3187                 } else {
3188                         info.jited_func_lens = 0;
3189                 }
3190         }
3191
3192         if (prog->aux->btf)
3193                 info.btf_id = btf_id(prog->aux->btf);
3194
3195         ulen = info.nr_func_info;
3196         info.nr_func_info = prog->aux->func_info_cnt;
3197         if (info.nr_func_info && ulen) {
3198                 char __user *user_finfo;
3199
3200                 user_finfo = u64_to_user_ptr(info.func_info);
3201                 ulen = min_t(u32, info.nr_func_info, ulen);
3202                 if (copy_to_user(user_finfo, prog->aux->func_info,
3203                                  info.func_info_rec_size * ulen))
3204                         return -EFAULT;
3205         }
3206
3207         ulen = info.nr_line_info;
3208         info.nr_line_info = prog->aux->nr_linfo;
3209         if (info.nr_line_info && ulen) {
3210                 __u8 __user *user_linfo;
3211
3212                 user_linfo = u64_to_user_ptr(info.line_info);
3213                 ulen = min_t(u32, info.nr_line_info, ulen);
3214                 if (copy_to_user(user_linfo, prog->aux->linfo,
3215                                  info.line_info_rec_size * ulen))
3216                         return -EFAULT;
3217         }
3218
3219         ulen = info.nr_jited_line_info;
3220         if (prog->aux->jited_linfo)
3221                 info.nr_jited_line_info = prog->aux->nr_linfo;
3222         else
3223                 info.nr_jited_line_info = 0;
3224         if (info.nr_jited_line_info && ulen) {
3225                 if (bpf_dump_raw_ok()) {
3226                         __u64 __user *user_linfo;
3227                         u32 i;
3228
3229                         user_linfo = u64_to_user_ptr(info.jited_line_info);
3230                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
3231                         for (i = 0; i < ulen; i++) {
3232                                 if (put_user((__u64)(long)prog->aux->jited_linfo[i],
3233                                              &user_linfo[i]))
3234                                         return -EFAULT;
3235                         }
3236                 } else {
3237                         info.jited_line_info = 0;
3238                 }
3239         }
3240
3241         ulen = info.nr_prog_tags;
3242         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
3243         if (ulen) {
3244                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
3245                 u32 i;
3246
3247                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
3248                 ulen = min_t(u32, info.nr_prog_tags, ulen);
3249                 if (prog->aux->func_cnt) {
3250                         for (i = 0; i < ulen; i++) {
3251                                 if (copy_to_user(user_prog_tags[i],
3252                                                  prog->aux->func[i]->tag,
3253                                                  BPF_TAG_SIZE))
3254                                         return -EFAULT;
3255                         }
3256                 } else {
3257                         if (copy_to_user(user_prog_tags[0],
3258                                          prog->tag, BPF_TAG_SIZE))
3259                                 return -EFAULT;
3260                 }
3261         }
3262
3263 done:
3264         if (copy_to_user(uinfo, &info, info_len) ||
3265             put_user(info_len, &uattr->info.info_len))
3266                 return -EFAULT;
3267
3268         return 0;
3269 }
3270
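/* Fill and copy out a bpf_map_info for BPF_OBJ_GET_INFO_BY_FD on a map fd.
 * Unlike the program variant above there is no variable-sized payload here;
 * only the fixed-size struct is copied, truncated to the info_len that user
 * space passed in.
 */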
3271 static int bpf_map_get_info_by_fd(struct bpf_map *map,
3272                                   const union bpf_attr *attr,
3273                                   union bpf_attr __user *uattr)
3274 {
3275         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3276         struct bpf_map_info info;
3277         u32 info_len = attr->info.info_len;
3278         int err;
3279
3280         err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
3281         if (err)
3282                 return err;
3283         info_len = min_t(u32, sizeof(info), info_len);
3284
3285         memset(&info, 0, sizeof(info));
3286         info.type = map->map_type;
3287         info.id = map->id;
3288         info.key_size = map->key_size;
3289         info.value_size = map->value_size;
3290         info.max_entries = map->max_entries;
3291         info.map_flags = map->map_flags;
3292         memcpy(info.name, map->name, sizeof(map->name));
3293
3294         if (map->btf) {
3295                 info.btf_id = btf_id(map->btf);
3296                 info.btf_key_type_id = map->btf_key_type_id;
3297                 info.btf_value_type_id = map->btf_value_type_id;
3298         }
3299         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
3300
3301         if (bpf_map_is_dev_bound(map)) {
3302                 err = bpf_map_offload_info_fill(&info, map);
3303                 if (err)
3304                         return err;
3305         }
3306
3307         if (copy_to_user(uinfo, &info, info_len) ||
3308             put_user(info_len, &uattr->info.info_len))
3309                 return -EFAULT;
3310
3311         return 0;
3312 }
3313
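/* BTF fds delegate to btf_get_info_by_fd(); this wrapper only verifies that
 * any bytes past the bpf_btf_info this kernel knows about are zero.
 */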
3314 static int bpf_btf_get_info_by_fd(struct btf *btf,
3315                                   const union bpf_attr *attr,
3316                                   union bpf_attr __user *uattr)
3317 {
3318         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3319         u32 info_len = attr->info.info_len;
3320         int err;
3321
3322         err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
3323         if (err)
3324                 return err;
3325
3326         return btf_get_info_by_fd(btf, attr, uattr);
3327 }
3328
3329 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
3330
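/* BPF_OBJ_GET_INFO_BY_FD: the fd may refer to a program, a map or a BTF
 * object.  The kind is recognized from the file's f_op and dispatched to
 * the matching helper above.
 *
 * A minimal user-space sketch for a program fd (illustrative only; prog_fd
 * is assumed to be a valid fd and error handling is omitted):
 *
 *	struct bpf_prog_info info = {};
 *	union bpf_attr attr = {};
 *
 *	attr.info.bpf_fd   = prog_fd;
 *	attr.info.info_len = sizeof(info);
 *	attr.info.info     = (__u64)(unsigned long)&info;
 *
 *	// first call fills in the nr_* counts ...
 *	syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
 *	// ... then buffers sized from those counts (e.g. nr_jited_ksyms)
 *	// can be hooked up to the pointer fields and the call repeated.
 */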
3331 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
3332                                   union bpf_attr __user *uattr)
3333 {
3334         int ufd = attr->info.bpf_fd;
3335         struct fd f;
3336         int err;
3337
3338         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
3339                 return -EINVAL;
3340
3341         f = fdget(ufd);
3342         if (!f.file)
3343                 return -EBADFD;
3344
3345         if (f.file->f_op == &bpf_prog_fops)
3346                 err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
3347                                               uattr);
3348         else if (f.file->f_op == &bpf_map_fops)
3349                 err = bpf_map_get_info_by_fd(f.file->private_data, attr,
3350                                              uattr);
3351         else if (f.file->f_op == &btf_fops)
3352                 err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
3353         else
3354                 err = -EINVAL;
3355
3356         fdput(f);
3357         return err;
3358 }
3359
3360 #define BPF_BTF_LOAD_LAST_FIELD btf_log_level
3361
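/* BPF_BTF_LOAD: verify a raw BTF blob supplied by user space and return an
 * fd for it.  CAP_SYS_ADMIN is required unconditionally, on top of the
 * sysctl_unprivileged_bpf_disabled gate at the syscall entry.  The attr
 * fields consumed here (btf, btf_size and the btf_log_* trio) are picked
 * apart in btf_new_fd().
 */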
3362 static int bpf_btf_load(const union bpf_attr *attr)
3363 {
3364         if (CHECK_ATTR(BPF_BTF_LOAD))
3365                 return -EINVAL;
3366
3367         if (!capable(CAP_SYS_ADMIN))
3368                 return -EPERM;
3369
3370         return btf_new_fd(attr);
3371 }
3372
3373 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
3374
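/* BPF_BTF_GET_FD_BY_ID: turn a global BTF object id into a new fd.  Ids
 * can be discovered by walking BPF_BTF_GET_NEXT_ID; both commands are
 * limited to CAP_SYS_ADMIN.
 */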
3375 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
3376 {
3377         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
3378                 return -EINVAL;
3379
3380         if (!capable(CAP_SYS_ADMIN))
3381                 return -EPERM;
3382
3383         return btf_get_fd_by_id(attr->btf_id);
3384 }
3385
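/* Copy the result of a task_fd_query back to user space.  The required
 * name length is always reported in buf_len; the name itself is written
 * truncated but NULL terminated, and -ENOSPC tells the caller truncation
 * happened even though the remaining fields were still filled in.
 */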
3386 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
3387                                     union bpf_attr __user *uattr,
3388                                     u32 prog_id, u32 fd_type,
3389                                     const char *buf, u64 probe_offset,
3390                                     u64 probe_addr)
3391 {
3392         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
3393         u32 len = buf ? strlen(buf) : 0, input_len;
3394         int err = 0;
3395
3396         if (put_user(len, &uattr->task_fd_query.buf_len))
3397                 return -EFAULT;
3398         input_len = attr->task_fd_query.buf_len;
3399         if (input_len && ubuf) {
3400                 if (!len) {
3401                         /* nothing to copy, just make ubuf NULL terminated */
3402                         char zero = '\0';
3403
3404                         if (put_user(zero, ubuf))
3405                                 return -EFAULT;
3406                 } else if (input_len >= len + 1) {
3407                         /* ubuf can hold the string with NULL terminator */
3408                         if (copy_to_user(ubuf, buf, len + 1))
3409                                 return -EFAULT;
3410                 } else {
3411                         /* ubuf cannot hold the full string; do a partial
3412                          * copy that still ends with a NULL terminator.
3413                          */
3414                         char zero = '\0';
3415
3416                         err = -ENOSPC;
3417                         if (copy_to_user(ubuf, buf, input_len - 1))
3418                                 return -EFAULT;
3419                         if (put_user(zero, ubuf + input_len - 1))
3420                                 return -EFAULT;
3421                 }
3422         }
3423
3424         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
3425             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
3426             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
3427             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
3428                 return -EFAULT;
3429
3430         return err;
3431 }
3432
3433 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
3434
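/* BPF_TASK_FD_QUERY: given a pid and an fd number inside that task, report
 * which BPF program sits behind the fd.  Two kinds of fd are recognized: a
 * bpf_link fd carrying a raw tracepoint, and a perf event fd (kprobe,
 * uprobe or tracepoint), for which bpf_get_perf_event_info() supplies the
 * details.
 *
 * A minimal user-space sketch (illustrative only, error handling omitted):
 *
 *	char buf[256];
 *	union bpf_attr attr = {};
 *
 *	attr.task_fd_query.pid     = target_pid;
 *	attr.task_fd_query.fd      = target_fd;
 *	attr.task_fd_query.buf     = (__u64)(unsigned long)buf;
 *	attr.task_fd_query.buf_len = sizeof(buf);
 *
 *	syscall(__NR_bpf, BPF_TASK_FD_QUERY, &attr, sizeof(attr));
 *	// on success prog_id, fd_type, probe_offset and probe_addr in
 *	// attr.task_fd_query describe the attachment and buf holds the
 *	// tracepoint/probe name.
 */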
3435 static int bpf_task_fd_query(const union bpf_attr *attr,
3436                              union bpf_attr __user *uattr)
3437 {
3438         pid_t pid = attr->task_fd_query.pid;
3439         u32 fd = attr->task_fd_query.fd;
3440         const struct perf_event *event;
3441         struct files_struct *files;
3442         struct task_struct *task;
3443         struct file *file;
3444         int err;
3445
3446         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
3447                 return -EINVAL;
3448
3449         if (!capable(CAP_SYS_ADMIN))
3450                 return -EPERM;
3451
3452         if (attr->task_fd_query.flags != 0)
3453                 return -EINVAL;
3454
3455         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
3456         if (!task)
3457                 return -ENOENT;
3458
3459         files = get_files_struct(task);
3460         put_task_struct(task);
3461         if (!files)
3462                 return -ENOENT;
3463
3464         err = 0;
3465         spin_lock(&files->file_lock);
3466         file = fcheck_files(files, fd);
3467         if (!file)
3468                 err = -EBADF;
3469         else
3470                 get_file(file);
3471         spin_unlock(&files->file_lock);
3472         put_files_struct(files);
3473
3474         if (err)
3475                 goto out;
3476
3477         if (file->f_op == &bpf_link_fops) {
3478                 struct bpf_link *link = file->private_data;
3479
3480                 if (link->ops == &bpf_raw_tp_lops) {
3481                         struct bpf_raw_tp_link *raw_tp =
3482                                 container_of(link, struct bpf_raw_tp_link, link);
3483                         struct bpf_raw_event_map *btp = raw_tp->btp;
3484
3485                         err = bpf_task_fd_query_copy(attr, uattr,
3486                                                      raw_tp->link.prog->aux->id,
3487                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
3488                                                      btp->tp->name, 0, 0);
3489                         goto put_file;
3490                 }
3491                 goto out_not_supp;
3492         }
3493
3494         event = perf_get_event(file);
3495         if (!IS_ERR(event)) {
3496                 u64 probe_offset, probe_addr;
3497                 u32 prog_id, fd_type;
3498                 const char *buf;
3499
3500                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
3501                                               &buf, &probe_offset,
3502                                               &probe_addr);
3503                 if (!err)
3504                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
3505                                                      fd_type, buf,
3506                                                      probe_offset,
3507                                                      probe_addr);
3508                 goto put_file;
3509         }
3510
3511 out_not_supp:
3512         err = -ENOTSUPP;
3513 put_file:
3514         fput(file);
3515 out:
3516         return err;
3517 }
3518
3519 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
3520
3521 #define BPF_DO_BATCH(fn)                        \
3522         do {                                    \
3523                 if (!fn) {                      \
3524                         err = -ENOTSUPP;        \
3525                         goto err_put;           \
3526                 }                               \
3527                 err = fn(map, attr, uattr);     \
3528         } while (0)
3529
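/* Common entry point for the four BPF_MAP_*_BATCH commands.  Access is
 * checked against the map fd's open mode: the two lookup flavours need
 * FMODE_CAN_READ, and everything except plain lookup needs FMODE_CAN_WRITE
 * (so lookup-and-delete needs both).  Map types that do not implement the
 * requested batch operation get -ENOTSUPP via BPF_DO_BATCH above.
 *
 * A minimal user-space sketch of a lookup batch (illustrative only; keys,
 * values and next_batch are assumed to be suitably sized buffers):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.batch.map_fd    = map_fd;
 *	attr.batch.keys      = (__u64)(unsigned long)keys;
 *	attr.batch.values    = (__u64)(unsigned long)values;
 *	attr.batch.count     = n;	// in: capacity, out: elements copied
 *	attr.batch.out_batch = (__u64)(unsigned long)next_batch;
 *
 *	syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
 *	// feed out_batch back in via attr.batch.in_batch to continue the
 *	// walk from where the previous call stopped.
 */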
3530 static int bpf_map_do_batch(const union bpf_attr *attr,
3531                             union bpf_attr __user *uattr,
3532                             int cmd)
3533 {
3534         struct bpf_map *map;
3535         int err, ufd;
3536         struct fd f;
3537
3538         if (CHECK_ATTR(BPF_MAP_BATCH))
3539                 return -EINVAL;
3540
3541         ufd = attr->batch.map_fd;
3542         f = fdget(ufd);
3543         map = __bpf_map_get(f);
3544         if (IS_ERR(map))
3545                 return PTR_ERR(map);
3546
3547         if ((cmd == BPF_MAP_LOOKUP_BATCH ||
3548              cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
3549             !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
3550                 err = -EPERM;
3551                 goto err_put;
3552         }
3553
3554         if (cmd != BPF_MAP_LOOKUP_BATCH &&
3555             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
3556                 err = -EPERM;
3557                 goto err_put;
3558         }
3559
3560         if (cmd == BPF_MAP_LOOKUP_BATCH)
3561                 BPF_DO_BATCH(map->ops->map_lookup_batch);
3562         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
3563                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
3564         else if (cmd == BPF_MAP_UPDATE_BATCH)
3565                 BPF_DO_BATCH(map->ops->map_update_batch);
3566         else
3567                 BPF_DO_BATCH(map->ops->map_delete_batch);
3568
3569 err_put:
3570         fdput(f);
3571         return err;
3572 }
3573
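/* BPF_LINK_CREATE: attach a program to a target through a bpf_link, so the
 * attachment lives exactly as long as the link (its fd or a pinned copy)
 * does.  In this version only the cgroup-attachable program types are
 * supported; the link_create attr carries prog_fd, target_fd (a cgroup fd
 * here), attach_type and flags.
 */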
3574 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
3575 static int link_create(union bpf_attr *attr)
3576 {
3577         enum bpf_prog_type ptype;
3578         struct bpf_prog *prog;
3579         int ret;
3580
3581         if (!capable(CAP_NET_ADMIN))
3582                 return -EPERM;
3583
3584         if (CHECK_ATTR(BPF_LINK_CREATE))
3585                 return -EINVAL;
3586
3587         ptype = attach_type_to_prog_type(attr->link_create.attach_type);
3588         if (ptype == BPF_PROG_TYPE_UNSPEC)
3589                 return -EINVAL;
3590
3591         prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype);
3592         if (IS_ERR(prog))
3593                 return PTR_ERR(prog);
3594
3595         ret = bpf_prog_attach_check_attach_type(prog,
3596                                                 attr->link_create.attach_type);
3597         if (ret)
3598                 goto err_out;
3599
3600         switch (ptype) {
3601         case BPF_PROG_TYPE_CGROUP_SKB:
3602         case BPF_PROG_TYPE_CGROUP_SOCK:
3603         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3604         case BPF_PROG_TYPE_SOCK_OPS:
3605         case BPF_PROG_TYPE_CGROUP_DEVICE:
3606         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3607         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3608                 ret = cgroup_bpf_link_attach(attr, prog);
3609                 break;
3610         default:
3611                 ret = -EINVAL;
3612         }
3613
3614 err_out:
3615         if (ret < 0)
3616                 bpf_prog_put(prog);
3617         return ret;
3618 }
3619
3620 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
3621
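/* BPF_LINK_UPDATE: atomically swap the program behind an existing link for
 * a new one.  With BPF_F_REPLACE the caller must also pass old_prog_fd and
 * the update only succeeds if that program is still the one attached;
 * without the flag old_prog_fd must be zero.
 */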
3622 static int link_update(union bpf_attr *attr)
3623 {
3624         struct bpf_prog *old_prog = NULL, *new_prog;
3625         struct bpf_link *link;
3626         u32 flags;
3627         int ret;
3628
3629         if (!capable(CAP_NET_ADMIN))
3630                 return -EPERM;
3631
3632         if (CHECK_ATTR(BPF_LINK_UPDATE))
3633                 return -EINVAL;
3634
3635         flags = attr->link_update.flags;
3636         if (flags & ~BPF_F_REPLACE)
3637                 return -EINVAL;
3638
3639         link = bpf_link_get_from_fd(attr->link_update.link_fd);
3640         if (IS_ERR(link))
3641                 return PTR_ERR(link);
3642
3643         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
3644         if (IS_ERR(new_prog)) {
3645                 ret = PTR_ERR(new_prog);
3646                 goto out_put_link;
3647         }
3648
3649         if (flags & BPF_F_REPLACE) {
3650                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
3651                 if (IS_ERR(old_prog)) {
3652                         ret = PTR_ERR(old_prog);
3653                         old_prog = NULL;
3654                         goto out_put_progs;
3655                 }
3656         } else if (attr->link_update.old_prog_fd) {
3657                 ret = -EINVAL;
3658                 goto out_put_progs;
3659         }
3660
3661 #ifdef CONFIG_CGROUP_BPF
3662         if (link->ops == &bpf_cgroup_link_lops) {
3663                 ret = cgroup_bpf_replace(link, old_prog, new_prog);
3664                 goto out_put_progs;
3665         }
3666 #endif
3667         ret = -EINVAL;
3668
3669 out_put_progs:
3670         if (old_prog)
3671                 bpf_prog_put(old_prog);
3672         if (ret)
3673                 bpf_prog_put(new_prog);
3674 out_put_link:
3675         bpf_link_put(link);
3676         return ret;
3677 }
3678
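/* The bpf(2) entry point.  The attr union is size-checked (bytes beyond
 * what this kernel knows about must be zero), copied in truncated to the
 * known size, run past the security_bpf() LSM hook and then dispatched on
 * cmd.  Unprivileged use as a whole can be switched off with the
 * kernel.unprivileged_bpf_disabled sysctl.
 *
 * There is traditionally no libc wrapper, so user space usually invokes it
 * along these lines (illustrative sketch):
 *
 *	#include <linux/bpf.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 *	{
 *		return syscall(__NR_bpf, cmd, attr, size);
 *	}
 */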
3679 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
3680 {
3681         union bpf_attr attr;
3682         int err;
3683
3684         if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
3685                 return -EPERM;
3686
3687         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
3688         if (err)
3689                 return err;
3690         size = min_t(u32, size, sizeof(attr));
3691
3692         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
3693         memset(&attr, 0, sizeof(attr));
3694         if (copy_from_user(&attr, uattr, size) != 0)
3695                 return -EFAULT;
3696
3697         err = security_bpf(cmd, &attr, size);
3698         if (err < 0)
3699                 return err;
3700
3701         switch (cmd) {
3702         case BPF_MAP_CREATE:
3703                 err = map_create(&attr);
3704                 break;
3705         case BPF_MAP_LOOKUP_ELEM:
3706                 err = map_lookup_elem(&attr);
3707                 break;
3708         case BPF_MAP_UPDATE_ELEM:
3709                 err = map_update_elem(&attr);
3710                 break;
3711         case BPF_MAP_DELETE_ELEM:
3712                 err = map_delete_elem(&attr);
3713                 break;
3714         case BPF_MAP_GET_NEXT_KEY:
3715                 err = map_get_next_key(&attr);
3716                 break;
3717         case BPF_MAP_FREEZE:
3718                 err = map_freeze(&attr);
3719                 break;
3720         case BPF_PROG_LOAD:
3721                 err = bpf_prog_load(&attr, uattr);
3722                 break;
3723         case BPF_OBJ_PIN:
3724                 err = bpf_obj_pin(&attr);
3725                 break;
3726         case BPF_OBJ_GET:
3727                 err = bpf_obj_get(&attr);
3728                 break;
3729         case BPF_PROG_ATTACH:
3730                 err = bpf_prog_attach(&attr);
3731                 break;
3732         case BPF_PROG_DETACH:
3733                 err = bpf_prog_detach(&attr);
3734                 break;
3735         case BPF_PROG_QUERY:
3736                 err = bpf_prog_query(&attr, uattr);
3737                 break;
3738         case BPF_PROG_TEST_RUN:
3739                 err = bpf_prog_test_run(&attr, uattr);
3740                 break;
3741         case BPF_PROG_GET_NEXT_ID:
3742                 err = bpf_obj_get_next_id(&attr, uattr,
3743                                           &prog_idr, &prog_idr_lock);
3744                 break;
3745         case BPF_MAP_GET_NEXT_ID:
3746                 err = bpf_obj_get_next_id(&attr, uattr,
3747                                           &map_idr, &map_idr_lock);
3748                 break;
3749         case BPF_BTF_GET_NEXT_ID:
3750                 err = bpf_obj_get_next_id(&attr, uattr,
3751                                           &btf_idr, &btf_idr_lock);
3752                 break;
3753         case BPF_PROG_GET_FD_BY_ID:
3754                 err = bpf_prog_get_fd_by_id(&attr);
3755                 break;
3756         case BPF_MAP_GET_FD_BY_ID:
3757                 err = bpf_map_get_fd_by_id(&attr);
3758                 break;
3759         case BPF_OBJ_GET_INFO_BY_FD:
3760                 err = bpf_obj_get_info_by_fd(&attr, uattr);
3761                 break;
3762         case BPF_RAW_TRACEPOINT_OPEN:
3763                 err = bpf_raw_tracepoint_open(&attr);
3764                 break;
3765         case BPF_BTF_LOAD:
3766                 err = bpf_btf_load(&attr);
3767                 break;
3768         case BPF_BTF_GET_FD_BY_ID:
3769                 err = bpf_btf_get_fd_by_id(&attr);
3770                 break;
3771         case BPF_TASK_FD_QUERY:
3772                 err = bpf_task_fd_query(&attr, uattr);
3773                 break;
3774         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
3775                 err = map_lookup_and_delete_elem(&attr);
3776                 break;
3777         case BPF_MAP_LOOKUP_BATCH:
3778                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
3779                 break;
3780         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
3781                 err = bpf_map_do_batch(&attr, uattr,
3782                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
3783                 break;
3784         case BPF_MAP_UPDATE_BATCH:
3785                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
3786                 break;
3787         case BPF_MAP_DELETE_BATCH:
3788                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
3789                 break;
3790         case BPF_LINK_CREATE:
3791                 err = link_create(&attr);
3792                 break;
3793         case BPF_LINK_UPDATE:
3794                 err = link_update(&attr);
3795                 break;
3796         default:
3797                 err = -EINVAL;
3798                 break;
3799         }
3800
3801         return err;
3802 }