mm: remove __vmalloc_node_flags_caller
[linux-2.6-microblaze.git] / kernel / bpf / syscall.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/bpf_trace.h>
6 #include <linux/bpf_lirc.h>
7 #include <linux/btf.h>
8 #include <linux/syscalls.h>
9 #include <linux/slab.h>
10 #include <linux/sched/signal.h>
11 #include <linux/vmalloc.h>
12 #include <linux/mmzone.h>
13 #include <linux/anon_inodes.h>
14 #include <linux/fdtable.h>
15 #include <linux/file.h>
16 #include <linux/fs.h>
17 #include <linux/license.h>
18 #include <linux/filter.h>
19 #include <linux/version.h>
20 #include <linux/kernel.h>
21 #include <linux/idr.h>
22 #include <linux/cred.h>
23 #include <linux/timekeeping.h>
24 #include <linux/ctype.h>
25 #include <linux/nospec.h>
26 #include <linux/audit.h>
27 #include <uapi/linux/btf.h>
28 #include <linux/bpf_lsm.h>
29
/* Predicates on a map's type.  The syscall paths in this file use them to
 * select the fd-based element helpers and the sizeof(u32) value size.
 */
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_ARRAY(map) \
	((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
	 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
	 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
/* True when any of the three predicates above holds. */
#define IS_FD_MAP(map) \
	(IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || IS_FD_HASH(map))
37
38 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
39
40 DEFINE_PER_CPU(int, bpf_prog_active);
41 static DEFINE_IDR(prog_idr);
42 static DEFINE_SPINLOCK(prog_idr_lock);
43 static DEFINE_IDR(map_idr);
44 static DEFINE_SPINLOCK(map_idr_lock);
45
46 int sysctl_unprivileged_bpf_disabled __read_mostly;
47
48 static const struct bpf_map_ops * const bpf_map_types[] = {
49 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
50 #define BPF_MAP_TYPE(_id, _ops) \
51         [_id] = &_ops,
52 #include <linux/bpf_types.h>
53 #undef BPF_PROG_TYPE
54 #undef BPF_MAP_TYPE
55 };
56
57 /*
58  * If we're handed a bigger struct than we know of, ensure all the unknown bits
59  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
60  * we don't know about yet.
61  *
62  * There is a ToCToU between this function call and the following
63  * copy_from_user() call. However, this is not a concern since this function is
64  * meant to be a future-proofing of bits.
65  */
66 int bpf_check_uarg_tail_zero(void __user *uaddr,
67                              size_t expected_size,
68                              size_t actual_size)
69 {
70         unsigned char __user *addr;
71         unsigned char __user *end;
72         unsigned char val;
73         int err;
74
75         if (unlikely(actual_size > PAGE_SIZE))  /* silly large */
76                 return -E2BIG;
77
78         if (unlikely(!access_ok(uaddr, actual_size)))
79                 return -EFAULT;
80
81         if (actual_size <= expected_size)
82                 return 0;
83
84         addr = uaddr + expected_size;
85         end  = uaddr + actual_size;
86
87         for (; addr < end; addr++) {
88                 err = get_user(val, addr);
89                 if (err)
90                         return err;
91                 if (val)
92                         return -E2BIG;
93         }
94
95         return 0;
96 }
97
98 const struct bpf_map_ops bpf_map_offload_ops = {
99         .map_alloc = bpf_map_offload_map_alloc,
100         .map_free = bpf_map_offload_map_free,
101         .map_check_btf = map_check_no_btf,
102 };
103
104 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
105 {
106         const struct bpf_map_ops *ops;
107         u32 type = attr->map_type;
108         struct bpf_map *map;
109         int err;
110
111         if (type >= ARRAY_SIZE(bpf_map_types))
112                 return ERR_PTR(-EINVAL);
113         type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
114         ops = bpf_map_types[type];
115         if (!ops)
116                 return ERR_PTR(-EINVAL);
117
118         if (ops->map_alloc_check) {
119                 err = ops->map_alloc_check(attr);
120                 if (err)
121                         return ERR_PTR(err);
122         }
123         if (attr->map_ifindex)
124                 ops = &bpf_map_offload_ops;
125         map = ops->map_alloc(attr);
126         if (IS_ERR(map))
127                 return map;
128         map->ops = ops;
129         map->map_type = type;
130         return map;
131 }
132
133 static u32 bpf_map_value_size(struct bpf_map *map)
134 {
135         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
136             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
137             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
138             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
139                 return round_up(map->value_size, 8) * num_possible_cpus();
140         else if (IS_FD_MAP(map))
141                 return sizeof(u32);
142         else
143                 return  map->value_size;
144 }
145
146 static void maybe_wait_bpf_programs(struct bpf_map *map)
147 {
148         /* Wait for any running BPF programs to complete so that
149          * userspace, when we return to it, knows that all programs
150          * that could be running use the new map value.
151          */
152         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
153             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
154                 synchronize_rcu();
155 }
156
/* Syscall-side element update: dispatch on the map's type to the helper
 * that stores @value under @key.
 *
 * @map:   map being updated
 * @f:     fd the map came from; f.file is handed to the fd-based helpers
 * @key:   key buffer (not passed on for queue/stack maps, which only push
 *         @value)
 * @value: value buffer
 * @flags: update flags, forwarded unchanged to the selected helper
 *
 * Returns the result of the selected helper.
 */
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
				void *value, __u64 flags)
{
	int err;

	/* Need to create a kthread, thus must support schedule.
	 *
	 * The three branches below return directly, i.e. they run without
	 * the bpf_disable_instrumentation()/bpf_enable_instrumentation()
	 * bracket and without maybe_wait_bpf_programs().
	 */
	if (bpf_map_is_dev_bound(map)) {
		return bpf_map_offload_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
		   map->map_type == BPF_MAP_TYPE_SOCKMAP ||
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		return map->ops->map_update_elem(map, key, value, flags);
	} else if (IS_FD_PROG_ARRAY(map)) {
		return bpf_fd_array_map_update_elem(map, f.file, key, value,
						    flags);
	}

	/* Everything from here on runs with instrumentation disabled. */
	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_update(map, key, value,
						       flags);
	} else if (IS_FD_ARRAY(map)) {
		/* prog arrays were already handled by the early return */
		rcu_read_lock();
		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
						   flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
		rcu_read_lock();
		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
						  flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		/* rcu_read_lock() is not needed */
		err = bpf_fd_reuseport_array_update_elem(map, key, value,
							 flags);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK) {
		/* queue/stack maps have no key: push the value */
		err = map->ops->map_push_elem(map, value, flags);
	} else {
		/* generic path: the type's own update hook under RCU */
		rcu_read_lock();
		err = map->ops->map_update_elem(map, key, value, flags);
		rcu_read_unlock();
	}
	bpf_enable_instrumentation();
	/* only waits for map-in-map types, see maybe_wait_bpf_programs() */
	maybe_wait_bpf_programs(map);

	return err;
}
211
/* Syscall-side element lookup: dispatch on the map's type and copy the
 * element stored under @key into @value.
 *
 * @map:   map being read
 * @key:   key buffer (not passed on for queue/stack maps, which peek)
 * @value: destination buffer, filled by the selected helper
 * @flags: only BPF_F_LOCK is examined here, and only on the generic path
 *
 * Returns 0 on success or a negative error; on the generic path that is
 * -ENOENT when the lookup hook returns NULL, or the error encoded in the
 * pointer it returns.
 */
static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
			      __u64 flags)
{
	void *ptr;
	int err;

	/* Device-bound maps return directly, skipping the instrumentation
	 * bracket and maybe_wait_bpf_programs() below.
	 */
	if (bpf_map_is_dev_bound(map))
		return bpf_map_offload_lookup_elem(map, key, value);

	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		err = bpf_stackmap_copy(map, key, value);
	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
		err = bpf_fd_array_map_lookup_elem(map, key, value);
	} else if (IS_FD_HASH(map)) {
		err = bpf_fd_htab_map_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK) {
		/* queue/stack maps have no key: peek at the value */
		err = map->ops->map_peek_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		/* struct_ops map requires directly updating "value" */
		err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	} else {
		/* Generic path: look the element up under RCU and copy it
		 * out before leaving the read-side critical section.  The
		 * syscall-only lookup hook is preferred when the type
		 * provides one.
		 */
		rcu_read_lock();
		if (map->ops->map_lookup_elem_sys_only)
			ptr = map->ops->map_lookup_elem_sys_only(map, key);
		else
			ptr = map->ops->map_lookup_elem(map, key);
		if (IS_ERR(ptr)) {
			err = PTR_ERR(ptr);
		} else if (!ptr) {
			err = -ENOENT;
		} else {
			err = 0;
			if (flags & BPF_F_LOCK)
				/* lock 'ptr' and copy everything but lock */
				copy_map_value_locked(map, value, ptr, true);
			else
				copy_map_value(map, value, ptr);
			/* mask lock, since value wasn't zero inited */
			check_and_init_map_lock(map, value);
		}
		rcu_read_unlock();
	}

	bpf_enable_instrumentation();
	/* only waits for map-in-map types, see maybe_wait_bpf_programs() */
	maybe_wait_bpf_programs(map);

	return err;
}
271
272 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
273 {
274         /* We really just want to fail instead of triggering OOM killer
275          * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
276          * which is used for lower order allocation requests.
277          *
278          * It has been observed that higher order allocation requests done by
279          * vmalloc with __GFP_NORETRY being set might fail due to not trying
280          * to reclaim memory from the page cache, thus we set
281          * __GFP_RETRY_MAYFAIL to avoid such situations.
282          */
283
284         const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
285         void *area;
286
287         if (size >= SIZE_MAX)
288                 return NULL;
289
290         /* kmalloc()'ed memory can't be mmap()'ed */
291         if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
292                 area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
293                                     numa_node);
294                 if (area != NULL)
295                         return area;
296         }
297         if (mmapable) {
298                 BUG_ON(!PAGE_ALIGNED(size));
299                 return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
300                                                __GFP_RETRY_MAYFAIL | flags);
301         }
302         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags,
303                               numa_node, __builtin_return_address(0));
304 }
305
306 void *bpf_map_area_alloc(u64 size, int numa_node)
307 {
308         return __bpf_map_area_alloc(size, numa_node, false);
309 }
310
311 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
312 {
313         return __bpf_map_area_alloc(size, numa_node, true);
314 }
315
/* Release an area obtained from one of the bpf_map_area_*alloc() helpers.
 * kvfree() is used because the area may have come from either kmalloc or
 * vmalloc.
 */
void bpf_map_area_free(void *area)
{
	kvfree(area);
}
320
321 static u32 bpf_map_flags_retain_permanent(u32 flags)
322 {
323         /* Some map creation flags are not tied to the map object but
324          * rather to the map fd instead, so they have no meaning upon
325          * map object inspection since multiple file descriptors with
326          * different (access) properties can exist here. Thus, given
327          * this has zero meaning for the map itself, lets clear these
328          * from here.
329          */
330         return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
331 }
332
333 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
334 {
335         map->map_type = attr->map_type;
336         map->key_size = attr->key_size;
337         map->value_size = attr->value_size;
338         map->max_entries = attr->max_entries;
339         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
340         map->numa_node = bpf_map_attr_numa_node(attr);
341 }
342
343 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
344 {
345         unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
346
347         if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
348                 atomic_long_sub(pages, &user->locked_vm);
349                 return -EPERM;
350         }
351         return 0;
352 }
353
354 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
355 {
356         if (user)
357                 atomic_long_sub(pages, &user->locked_vm);
358 }
359
360 int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
361 {
362         u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
363         struct user_struct *user;
364         int ret;
365
366         if (size >= U32_MAX - PAGE_SIZE)
367                 return -E2BIG;
368
369         user = get_current_user();
370         ret = bpf_charge_memlock(user, pages);
371         if (ret) {
372                 free_uid(user);
373                 return ret;
374         }
375
376         mem->pages = pages;
377         mem->user = user;
378
379         return 0;
380 }
381
382 void bpf_map_charge_finish(struct bpf_map_memory *mem)
383 {
384         bpf_uncharge_memlock(mem->user, mem->pages);
385         free_uid(mem->user);
386 }
387
388 void bpf_map_charge_move(struct bpf_map_memory *dst,
389                          struct bpf_map_memory *src)
390 {
391         *dst = *src;
392
393         /* Make sure src will not be used for the redundant uncharging. */
394         memset(src, 0, sizeof(struct bpf_map_memory));
395 }
396
397 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
398 {
399         int ret;
400
401         ret = bpf_charge_memlock(map->memory.user, pages);
402         if (ret)
403                 return ret;
404         map->memory.pages += pages;
405         return ret;
406 }
407
408 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
409 {
410         bpf_uncharge_memlock(map->memory.user, pages);
411         map->memory.pages -= pages;
412 }
413
414 static int bpf_map_alloc_id(struct bpf_map *map)
415 {
416         int id;
417
418         idr_preload(GFP_KERNEL);
419         spin_lock_bh(&map_idr_lock);
420         id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
421         if (id > 0)
422                 map->id = id;
423         spin_unlock_bh(&map_idr_lock);
424         idr_preload_end();
425
426         if (WARN_ON_ONCE(!id))
427                 return -ENOSPC;
428
429         return id > 0 ? 0 : id;
430 }
431
432 void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
433 {
434         unsigned long flags;
435
436         /* Offloaded maps are removed from the IDR store when their device
437          * disappears - even if someone holds an fd to them they are unusable,
438          * the memory is gone, all ops will fail; they are simply waiting for
439          * refcnt to drop to be freed.
440          */
441         if (!map->id)
442                 return;
443
444         if (do_idr_lock)
445                 spin_lock_irqsave(&map_idr_lock, flags);
446         else
447                 __acquire(&map_idr_lock);
448
449         idr_remove(&map_idr, map->id);
450         map->id = 0;
451
452         if (do_idr_lock)
453                 spin_unlock_irqrestore(&map_idr_lock, flags);
454         else
455                 __release(&map_idr_lock);
456 }
457
458 /* called from workqueue */
459 static void bpf_map_free_deferred(struct work_struct *work)
460 {
461         struct bpf_map *map = container_of(work, struct bpf_map, work);
462         struct bpf_map_memory mem;
463
464         bpf_map_charge_move(&mem, &map->memory);
465         security_bpf_map_free(map);
466         /* implementation dependent freeing */
467         map->ops->map_free(map);
468         bpf_map_charge_finish(&mem);
469 }
470
471 static void bpf_map_put_uref(struct bpf_map *map)
472 {
473         if (atomic64_dec_and_test(&map->usercnt)) {
474                 if (map->ops->map_release_uref)
475                         map->ops->map_release_uref(map);
476         }
477 }
478
479 /* decrement map refcnt and schedule it for freeing via workqueue
480  * (unrelying map implementation ops->map_free() might sleep)
481  */
482 static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
483 {
484         if (atomic64_dec_and_test(&map->refcnt)) {
485                 /* bpf_map_free_id() must be called first */
486                 bpf_map_free_id(map, do_idr_lock);
487                 btf_put(map->btf);
488                 INIT_WORK(&map->work, bpf_map_free_deferred);
489                 schedule_work(&map->work);
490         }
491 }
492
493 void bpf_map_put(struct bpf_map *map)
494 {
495         __bpf_map_put(map, true);
496 }
497 EXPORT_SYMBOL_GPL(bpf_map_put);
498
/* Drop one usercnt reference and then one refcnt reference on @map. */
void bpf_map_put_with_uref(struct bpf_map *map)
{
	/* the uref goes first, while our refcnt still keeps the map alive */
	bpf_map_put_uref(map);
	bpf_map_put(map);
}
504
505 static int bpf_map_release(struct inode *inode, struct file *filp)
506 {
507         struct bpf_map *map = filp->private_data;
508
509         if (map->ops->map_release)
510                 map->ops->map_release(map, filp);
511
512         bpf_map_put_with_uref(map);
513         return 0;
514 }
515
516 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
517 {
518         fmode_t mode = f.file->f_mode;
519
520         /* Our file permissions may have been overridden by global
521          * map permissions facing syscall side.
522          */
523         if (READ_ONCE(map->frozen))
524                 mode &= ~FMODE_CAN_WRITE;
525         return mode;
526 }
527
#ifdef CONFIG_PROC_FS
/* ->show_fdinfo handler of bpf_map_fops: print the map's basic attributes
 * to @m.  For prog arrays with a non-zero owner type, the owner program
 * type and jited state are printed as well.
 */
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_map *map = filp->private_data;
	u32 owner_type = 0, owner_jited = 0;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
		const struct bpf_array *array =
			container_of(map, struct bpf_array, map);

		owner_type  = array->aux->type;
		owner_jited = array->aux->jited;
	}

	seq_printf(m,
		   "map_type:\t%u\n"
		   "key_size:\t%u\n"
		   "value_size:\t%u\n"
		   "max_entries:\t%u\n"
		   "map_flags:\t%#x\n"
		   "memlock:\t%llu\n"
		   "map_id:\t%u\n"
		   "frozen:\t%u\n",
		   map->map_type,
		   map->key_size,
		   map->value_size,
		   map->max_entries,
		   map->map_flags,
		   /* charged pages converted to bytes, in 64 bit */
		   map->memory.pages * 1ULL << PAGE_SHIFT,
		   map->id,
		   READ_ONCE(map->frozen));

	if (!owner_type)
		return;

	seq_printf(m, "owner_prog_type:\t%u\n", owner_type);
	seq_printf(m, "owner_jited:\t%u\n", owner_jited);
}
#endif
564
565 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
566                               loff_t *ppos)
567 {
568         /* We need this handler such that alloc_file() enables
569          * f_mode with FMODE_CAN_READ.
570          */
571         return -EINVAL;
572 }
573
574 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
575                                size_t siz, loff_t *ppos)
576 {
577         /* We need this handler such that alloc_file() enables
578          * f_mode with FMODE_CAN_WRITE.
579          */
580         return -EINVAL;
581 }
582
583 /* called for any extra memory-mapped regions (except initial) */
584 static void bpf_map_mmap_open(struct vm_area_struct *vma)
585 {
586         struct bpf_map *map = vma->vm_file->private_data;
587
588         if (vma->vm_flags & VM_MAYWRITE) {
589                 mutex_lock(&map->freeze_mutex);
590                 map->writecnt++;
591                 mutex_unlock(&map->freeze_mutex);
592         }
593 }
594
595 /* called for all unmapped memory region (including initial) */
596 static void bpf_map_mmap_close(struct vm_area_struct *vma)
597 {
598         struct bpf_map *map = vma->vm_file->private_data;
599
600         if (vma->vm_flags & VM_MAYWRITE) {
601                 mutex_lock(&map->freeze_mutex);
602                 map->writecnt--;
603                 mutex_unlock(&map->freeze_mutex);
604         }
605 }
606
607 static const struct vm_operations_struct bpf_map_default_vmops = {
608         .open           = bpf_map_mmap_open,
609         .close          = bpf_map_mmap_close,
610 };
611
/* ->mmap handler of bpf_map_fops.
 *
 * Only shared mappings of maps that implement map_mmap() and whose value
 * contains no spin lock are accepted.  Writable mappings are refused for
 * frozen maps and for maps created with BPF_F_RDONLY_PROG.
 *
 * Returns 0 on success, -ENOTSUPP, -EINVAL, -EPERM or -EACCES for the
 * rejections described above, or the error from the map's map_mmap() hook.
 */
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct bpf_map *map = filp->private_data;
	int err;

	if (!map->ops->map_mmap || map_value_has_spin_lock(map))
		return -ENOTSUPP;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	/* freeze_mutex covers both the frozen check and the writecnt update */
	mutex_lock(&map->freeze_mutex);

	if (vma->vm_flags & VM_WRITE) {
		if (map->frozen) {
			err = -EPERM;
			goto out;
		}
		/* map is meant to be read-only, so do not allow mapping as
		 * writable, because it's possible to leak a writable page
		 * reference and allows user-space to still modify it after
		 * freezing, while verifier will assume contents do not change
		 */
		if (map->map_flags & BPF_F_RDONLY_PROG) {
			err = -EACCES;
			goto out;
		}
	}

	/* set default open/close callbacks */
	vma->vm_ops = &bpf_map_default_vmops;
	vma->vm_private_data = map;
	/* the mapping can never be made executable later */
	vma->vm_flags &= ~VM_MAYEXEC;
	if (!(vma->vm_flags & VM_WRITE))
		/* disallow re-mapping with PROT_WRITE */
		vma->vm_flags &= ~VM_MAYWRITE;

	err = map->ops->map_mmap(map, vma);
	if (err)
		goto out;

	/* account the initial mapping; bpf_map_mmap_open() is only called
	 * for additional regions
	 */
	if (vma->vm_flags & VM_MAYWRITE)
		map->writecnt++;
out:
	mutex_unlock(&map->freeze_mutex);
	return err;
}
659
660 const struct file_operations bpf_map_fops = {
661 #ifdef CONFIG_PROC_FS
662         .show_fdinfo    = bpf_map_show_fdinfo,
663 #endif
664         .release        = bpf_map_release,
665         .read           = bpf_dummy_read,
666         .write          = bpf_dummy_write,
667         .mmap           = bpf_map_mmap,
668 };
669
670 int bpf_map_new_fd(struct bpf_map *map, int flags)
671 {
672         int ret;
673
674         ret = security_bpf_map(map, OPEN_FMODE(flags));
675         if (ret < 0)
676                 return ret;
677
678         return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
679                                 flags | O_CLOEXEC);
680 }
681
682 int bpf_get_file_flag(int flags)
683 {
684         if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
685                 return -EINVAL;
686         if (flags & BPF_F_RDONLY)
687                 return O_RDONLY;
688         if (flags & BPF_F_WRONLY)
689                 return O_WRONLY;
690         return O_RDWR;
691 }
692
/* Helper macro to check that the unused fields of 'union bpf_attr' are
 * zero.
 *
 * Evaluates to true when any byte of *attr located after the command's
 * last field (CMD##_LAST_FIELD) is non-zero.  It relies on a variable
 * named 'attr' of type 'union bpf_attr *' being in scope at the point of
 * use.
 *
 * The expansion is fully parenthesized so that the macro can be combined
 * safely with other operators (e.g. negated) at the call site.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
700
/* Copy an object name from @src into @dst.
 *
 * Both buffers must be at least @size bytes long.  @dst is zeroed first,
 * then characters are copied up to the terminating '\0'.  Only isalnum()
 * characters, '_' and '.' are allowed.
 *
 * Returns the length of the name on success, -EINVAL if an illegal
 * character is met or no '\0' is found within @size bytes.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	unsigned int pos;

	memset(dst, 0, size);

	for (pos = 0; pos < size; pos++) {
		char c = src[pos];

		if (c == '\0')
			return pos;

		if (c != '_' && c != '.' && !isalnum(c))
			return -EINVAL;

		dst[pos] = c;
	}

	/* No '\0' found in "size" number of bytes */
	return -EINVAL;
}
724
725 int map_check_no_btf(const struct bpf_map *map,
726                      const struct btf *btf,
727                      const struct btf_type *key_type,
728                      const struct btf_type *value_type)
729 {
730         return -ENOTSUPP;
731 }
732
733 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
734                          u32 btf_key_id, u32 btf_value_id)
735 {
736         const struct btf_type *key_type, *value_type;
737         u32 key_size, value_size;
738         int ret = 0;
739
740         /* Some maps allow key to be unspecified. */
741         if (btf_key_id) {
742                 key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
743                 if (!key_type || key_size != map->key_size)
744                         return -EINVAL;
745         } else {
746                 key_type = btf_type_by_id(btf, 0);
747                 if (!map->ops->map_check_btf)
748                         return -EINVAL;
749         }
750
751         value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
752         if (!value_type || value_size != map->value_size)
753                 return -EINVAL;
754
755         map->spin_lock_off = btf_find_spin_lock(btf, value_type);
756
757         if (map_value_has_spin_lock(map)) {
758                 if (map->map_flags & BPF_F_RDONLY_PROG)
759                         return -EACCES;
760                 if (map->map_type != BPF_MAP_TYPE_HASH &&
761                     map->map_type != BPF_MAP_TYPE_ARRAY &&
762                     map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
763                     map->map_type != BPF_MAP_TYPE_SK_STORAGE)
764                         return -ENOTSUPP;
765                 if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
766                     map->value_size) {
767                         WARN_ONCE(1,
768                                   "verifier bug spin_lock_off %d value_size %d\n",
769                                   map->spin_lock_off, map->value_size);
770                         return -EFAULT;
771                 }
772         }
773
774         if (map->ops->map_check_btf)
775                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
776
777         return ret;
778 }
779
/* Last field of union bpf_attr that BPF_MAP_CREATE understands; everything
 * after it must be zero (see CHECK_ATTR).
 */
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall
 *
 * Create a map described by @attr and return a new fd for it.
 *
 * Returns the fd (>= 0) on success or a negative error.  On every failure
 * before the id is allocated the map is torn down here directly; once the
 * id has been allocated the map is visible to user space and is released
 * through bpf_map_put_with_uref() instead.
 */
static int map_create(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map_memory mem;
	struct bpf_map *map;
	int f_flags;
	int err;

	/* CHECK_ATTR() yields true when trailing attr bytes are non-zero */
	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* btf_vmlinux_value_type_id is exclusive to struct_ops maps and
	 * cannot be combined with the regular key/value type ids; a key
	 * type id without a value type id is rejected as well.
	 */
	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
		    attr->btf_key_type_id || attr->btf_value_type_id)
			return -EINVAL;
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		return -EINVAL;
	}

	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	/* an explicitly requested node must exist and be online */
	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name));
	if (err < 0)
		goto free_map;

	/* the creator holds one reference and one user reference */
	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);

	/* no spin lock in the value until map_check_btf() says otherwise */
	map->spin_lock_off = -EINVAL;
	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even the map's value is a kernel's struct,
	     * the bpf_prog.o must have BTF to begin with
	     * to figure out the corresponding kernel's
	     * counter part.  Thus, attr->btf_fd has
	     * to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			err = PTR_ERR(btf);
			goto free_map;
		}
		/* stored right away so that free_map drops the reference */
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	err = security_bpf_map_alloc(map);
	if (err)
		goto free_map;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}

	/* on success err holds the new fd */
	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map:
	btf_put(map->btf);
	/* keep the charge in a local copy: it is released only after
	 * map_free() has run
	 */
	bpf_map_charge_move(&mem, &map->memory);
	map->ops->map_free(map);
	bpf_map_charge_finish(&mem);
	return err;
}
887
888 /* if error is returned, fd is released.
889  * On success caller should complete fd access with matching fdput()
890  */
891 struct bpf_map *__bpf_map_get(struct fd f)
892 {
893         if (!f.file)
894                 return ERR_PTR(-EBADF);
895         if (f.file->f_op != &bpf_map_fops) {
896                 fdput(f);
897                 return ERR_PTR(-EINVAL);
898         }
899
900         return f.file->private_data;
901 }
902
903 void bpf_map_inc(struct bpf_map *map)
904 {
905         atomic64_inc(&map->refcnt);
906 }
907 EXPORT_SYMBOL_GPL(bpf_map_inc);
908
909 void bpf_map_inc_with_uref(struct bpf_map *map)
910 {
911         atomic64_inc(&map->refcnt);
912         atomic64_inc(&map->usercnt);
913 }
914 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
915
916 struct bpf_map *bpf_map_get(u32 ufd)
917 {
918         struct fd f = fdget(ufd);
919         struct bpf_map *map;
920
921         map = __bpf_map_get(f);
922         if (IS_ERR(map))
923                 return map;
924
925         bpf_map_inc(map);
926         fdput(f);
927
928         return map;
929 }
930
931 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
932 {
933         struct fd f = fdget(ufd);
934         struct bpf_map *map;
935
936         map = __bpf_map_get(f);
937         if (IS_ERR(map))
938                 return map;
939
940         bpf_map_inc_with_uref(map);
941         fdput(f);
942
943         return map;
944 }
945
946 /* map_idr_lock should have been held */
947 static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
948 {
949         int refold;
950
951         refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
952         if (!refold)
953                 return ERR_PTR(-ENOENT);
954         if (uref)
955                 atomic64_inc(&map->usercnt);
956
957         return map;
958 }
959
960 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
961 {
962         spin_lock_bh(&map_idr_lock);
963         map = __bpf_map_inc_not_zero(map, false);
964         spin_unlock_bh(&map_idr_lock);
965
966         return map;
967 }
968 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
969
970 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
971 {
972         return -ENOTSUPP;
973 }
974
975 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
976 {
977         if (key_size)
978                 return memdup_user(ukey, key_size);
979
980         if (ukey)
981                 return ERR_PTR(-EINVAL);
982
983         return NULL;
984 }
985
986 /* last field in 'union bpf_attr' used by this command */
987 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
988
989 static int map_lookup_elem(union bpf_attr *attr)
990 {
991         void __user *ukey = u64_to_user_ptr(attr->key);
992         void __user *uvalue = u64_to_user_ptr(attr->value);
993         int ufd = attr->map_fd;
994         struct bpf_map *map;
995         void *key, *value;
996         u32 value_size;
997         struct fd f;
998         int err;
999
1000         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1001                 return -EINVAL;
1002
1003         if (attr->flags & ~BPF_F_LOCK)
1004                 return -EINVAL;
1005
1006         f = fdget(ufd);
1007         map = __bpf_map_get(f);
1008         if (IS_ERR(map))
1009                 return PTR_ERR(map);
1010         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1011                 err = -EPERM;
1012                 goto err_put;
1013         }
1014
1015         if ((attr->flags & BPF_F_LOCK) &&
1016             !map_value_has_spin_lock(map)) {
1017                 err = -EINVAL;
1018                 goto err_put;
1019         }
1020
1021         key = __bpf_copy_key(ukey, map->key_size);
1022         if (IS_ERR(key)) {
1023                 err = PTR_ERR(key);
1024                 goto err_put;
1025         }
1026
1027         value_size = bpf_map_value_size(map);
1028
1029         err = -ENOMEM;
1030         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1031         if (!value)
1032                 goto free_key;
1033
1034         err = bpf_map_copy_value(map, key, value, attr->flags);
1035         if (err)
1036                 goto free_value;
1037
1038         err = -EFAULT;
1039         if (copy_to_user(uvalue, value, value_size) != 0)
1040                 goto free_value;
1041
1042         err = 0;
1043
1044 free_value:
1045         kfree(value);
1046 free_key:
1047         kfree(key);
1048 err_put:
1049         fdput(f);
1050         return err;
1051 }
1052
1053
1054 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1055
1056 static int map_update_elem(union bpf_attr *attr)
1057 {
1058         void __user *ukey = u64_to_user_ptr(attr->key);
1059         void __user *uvalue = u64_to_user_ptr(attr->value);
1060         int ufd = attr->map_fd;
1061         struct bpf_map *map;
1062         void *key, *value;
1063         u32 value_size;
1064         struct fd f;
1065         int err;
1066
1067         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1068                 return -EINVAL;
1069
1070         f = fdget(ufd);
1071         map = __bpf_map_get(f);
1072         if (IS_ERR(map))
1073                 return PTR_ERR(map);
1074         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1075                 err = -EPERM;
1076                 goto err_put;
1077         }
1078
1079         if ((attr->flags & BPF_F_LOCK) &&
1080             !map_value_has_spin_lock(map)) {
1081                 err = -EINVAL;
1082                 goto err_put;
1083         }
1084
1085         key = __bpf_copy_key(ukey, map->key_size);
1086         if (IS_ERR(key)) {
1087                 err = PTR_ERR(key);
1088                 goto err_put;
1089         }
1090
1091         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
1092             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
1093             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
1094             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
1095                 value_size = round_up(map->value_size, 8) * num_possible_cpus();
1096         else
1097                 value_size = map->value_size;
1098
1099         err = -ENOMEM;
1100         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1101         if (!value)
1102                 goto free_key;
1103
1104         err = -EFAULT;
1105         if (copy_from_user(value, uvalue, value_size) != 0)
1106                 goto free_value;
1107
1108         err = bpf_map_update_value(map, f, key, value, attr->flags);
1109
1110 free_value:
1111         kfree(value);
1112 free_key:
1113         kfree(key);
1114 err_put:
1115         fdput(f);
1116         return err;
1117 }
1118
1119 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1120
1121 static int map_delete_elem(union bpf_attr *attr)
1122 {
1123         void __user *ukey = u64_to_user_ptr(attr->key);
1124         int ufd = attr->map_fd;
1125         struct bpf_map *map;
1126         struct fd f;
1127         void *key;
1128         int err;
1129
1130         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1131                 return -EINVAL;
1132
1133         f = fdget(ufd);
1134         map = __bpf_map_get(f);
1135         if (IS_ERR(map))
1136                 return PTR_ERR(map);
1137         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1138                 err = -EPERM;
1139                 goto err_put;
1140         }
1141
1142         key = __bpf_copy_key(ukey, map->key_size);
1143         if (IS_ERR(key)) {
1144                 err = PTR_ERR(key);
1145                 goto err_put;
1146         }
1147
1148         if (bpf_map_is_dev_bound(map)) {
1149                 err = bpf_map_offload_delete_elem(map, key);
1150                 goto out;
1151         } else if (IS_FD_PROG_ARRAY(map) ||
1152                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1153                 /* These maps require sleepable context */
1154                 err = map->ops->map_delete_elem(map, key);
1155                 goto out;
1156         }
1157
1158         bpf_disable_instrumentation();
1159         rcu_read_lock();
1160         err = map->ops->map_delete_elem(map, key);
1161         rcu_read_unlock();
1162         bpf_enable_instrumentation();
1163         maybe_wait_bpf_programs(map);
1164 out:
1165         kfree(key);
1166 err_put:
1167         fdput(f);
1168         return err;
1169 }
1170
1171 /* last field in 'union bpf_attr' used by this command */
1172 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1173
1174 static int map_get_next_key(union bpf_attr *attr)
1175 {
1176         void __user *ukey = u64_to_user_ptr(attr->key);
1177         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1178         int ufd = attr->map_fd;
1179         struct bpf_map *map;
1180         void *key, *next_key;
1181         struct fd f;
1182         int err;
1183
1184         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1185                 return -EINVAL;
1186
1187         f = fdget(ufd);
1188         map = __bpf_map_get(f);
1189         if (IS_ERR(map))
1190                 return PTR_ERR(map);
1191         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1192                 err = -EPERM;
1193                 goto err_put;
1194         }
1195
1196         if (ukey) {
1197                 key = __bpf_copy_key(ukey, map->key_size);
1198                 if (IS_ERR(key)) {
1199                         err = PTR_ERR(key);
1200                         goto err_put;
1201                 }
1202         } else {
1203                 key = NULL;
1204         }
1205
1206         err = -ENOMEM;
1207         next_key = kmalloc(map->key_size, GFP_USER);
1208         if (!next_key)
1209                 goto free_key;
1210
1211         if (bpf_map_is_dev_bound(map)) {
1212                 err = bpf_map_offload_get_next_key(map, key, next_key);
1213                 goto out;
1214         }
1215
1216         rcu_read_lock();
1217         err = map->ops->map_get_next_key(map, key, next_key);
1218         rcu_read_unlock();
1219 out:
1220         if (err)
1221                 goto free_next_key;
1222
1223         err = -EFAULT;
1224         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1225                 goto free_next_key;
1226
1227         err = 0;
1228
1229 free_next_key:
1230         kfree(next_key);
1231 free_key:
1232         kfree(key);
1233 err_put:
1234         fdput(f);
1235         return err;
1236 }
1237
1238 int generic_map_delete_batch(struct bpf_map *map,
1239                              const union bpf_attr *attr,
1240                              union bpf_attr __user *uattr)
1241 {
1242         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1243         u32 cp, max_count;
1244         int err = 0;
1245         void *key;
1246
1247         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1248                 return -EINVAL;
1249
1250         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1251             !map_value_has_spin_lock(map)) {
1252                 return -EINVAL;
1253         }
1254
1255         max_count = attr->batch.count;
1256         if (!max_count)
1257                 return 0;
1258
1259         key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1260         if (!key)
1261                 return -ENOMEM;
1262
1263         for (cp = 0; cp < max_count; cp++) {
1264                 err = -EFAULT;
1265                 if (copy_from_user(key, keys + cp * map->key_size,
1266                                    map->key_size))
1267                         break;
1268
1269                 if (bpf_map_is_dev_bound(map)) {
1270                         err = bpf_map_offload_delete_elem(map, key);
1271                         break;
1272                 }
1273
1274                 bpf_disable_instrumentation();
1275                 rcu_read_lock();
1276                 err = map->ops->map_delete_elem(map, key);
1277                 rcu_read_unlock();
1278                 bpf_enable_instrumentation();
1279                 maybe_wait_bpf_programs(map);
1280                 if (err)
1281                         break;
1282         }
1283         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1284                 err = -EFAULT;
1285
1286         kfree(key);
1287         return err;
1288 }
1289
1290 int generic_map_update_batch(struct bpf_map *map,
1291                              const union bpf_attr *attr,
1292                              union bpf_attr __user *uattr)
1293 {
1294         void __user *values = u64_to_user_ptr(attr->batch.values);
1295         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1296         u32 value_size, cp, max_count;
1297         int ufd = attr->map_fd;
1298         void *key, *value;
1299         struct fd f;
1300         int err = 0;
1301
1302         f = fdget(ufd);
1303         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1304                 return -EINVAL;
1305
1306         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1307             !map_value_has_spin_lock(map)) {
1308                 return -EINVAL;
1309         }
1310
1311         value_size = bpf_map_value_size(map);
1312
1313         max_count = attr->batch.count;
1314         if (!max_count)
1315                 return 0;
1316
1317         key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1318         if (!key)
1319                 return -ENOMEM;
1320
1321         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1322         if (!value) {
1323                 kfree(key);
1324                 return -ENOMEM;
1325         }
1326
1327         for (cp = 0; cp < max_count; cp++) {
1328                 err = -EFAULT;
1329                 if (copy_from_user(key, keys + cp * map->key_size,
1330                     map->key_size) ||
1331                     copy_from_user(value, values + cp * value_size, value_size))
1332                         break;
1333
1334                 err = bpf_map_update_value(map, f, key, value,
1335                                            attr->batch.elem_flags);
1336
1337                 if (err)
1338                         break;
1339         }
1340
1341         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1342                 err = -EFAULT;
1343
1344         kfree(value);
1345         kfree(key);
1346         return err;
1347 }
1348
1349 #define MAP_LOOKUP_RETRIES 3
1350
1351 int generic_map_lookup_batch(struct bpf_map *map,
1352                                     const union bpf_attr *attr,
1353                                     union bpf_attr __user *uattr)
1354 {
1355         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1356         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1357         void __user *values = u64_to_user_ptr(attr->batch.values);
1358         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1359         void *buf, *buf_prevkey, *prev_key, *key, *value;
1360         int err, retry = MAP_LOOKUP_RETRIES;
1361         u32 value_size, cp, max_count;
1362
1363         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1364                 return -EINVAL;
1365
1366         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1367             !map_value_has_spin_lock(map))
1368                 return -EINVAL;
1369
1370         value_size = bpf_map_value_size(map);
1371
1372         max_count = attr->batch.count;
1373         if (!max_count)
1374                 return 0;
1375
1376         if (put_user(0, &uattr->batch.count))
1377                 return -EFAULT;
1378
1379         buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1380         if (!buf_prevkey)
1381                 return -ENOMEM;
1382
1383         buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1384         if (!buf) {
1385                 kvfree(buf_prevkey);
1386                 return -ENOMEM;
1387         }
1388
1389         err = -EFAULT;
1390         prev_key = NULL;
1391         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1392                 goto free_buf;
1393         key = buf;
1394         value = key + map->key_size;
1395         if (ubatch)
1396                 prev_key = buf_prevkey;
1397
1398         for (cp = 0; cp < max_count;) {
1399                 rcu_read_lock();
1400                 err = map->ops->map_get_next_key(map, prev_key, key);
1401                 rcu_read_unlock();
1402                 if (err)
1403                         break;
1404                 err = bpf_map_copy_value(map, key, value,
1405                                          attr->batch.elem_flags);
1406
1407                 if (err == -ENOENT) {
1408                         if (retry) {
1409                                 retry--;
1410                                 continue;
1411                         }
1412                         err = -EINTR;
1413                         break;
1414                 }
1415
1416                 if (err)
1417                         goto free_buf;
1418
1419                 if (copy_to_user(keys + cp * map->key_size, key,
1420                                  map->key_size)) {
1421                         err = -EFAULT;
1422                         goto free_buf;
1423                 }
1424                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1425                         err = -EFAULT;
1426                         goto free_buf;
1427                 }
1428
1429                 if (!prev_key)
1430                         prev_key = buf_prevkey;
1431
1432                 swap(prev_key, key);
1433                 retry = MAP_LOOKUP_RETRIES;
1434                 cp++;
1435         }
1436
1437         if (err == -EFAULT)
1438                 goto free_buf;
1439
1440         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1441                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1442                 err = -EFAULT;
1443
1444 free_buf:
1445         kfree(buf_prevkey);
1446         kfree(buf);
1447         return err;
1448 }
1449
1450 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
1451
1452 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1453 {
1454         void __user *ukey = u64_to_user_ptr(attr->key);
1455         void __user *uvalue = u64_to_user_ptr(attr->value);
1456         int ufd = attr->map_fd;
1457         struct bpf_map *map;
1458         void *key, *value;
1459         u32 value_size;
1460         struct fd f;
1461         int err;
1462
1463         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1464                 return -EINVAL;
1465
1466         f = fdget(ufd);
1467         map = __bpf_map_get(f);
1468         if (IS_ERR(map))
1469                 return PTR_ERR(map);
1470         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1471                 err = -EPERM;
1472                 goto err_put;
1473         }
1474
1475         key = __bpf_copy_key(ukey, map->key_size);
1476         if (IS_ERR(key)) {
1477                 err = PTR_ERR(key);
1478                 goto err_put;
1479         }
1480
1481         value_size = map->value_size;
1482
1483         err = -ENOMEM;
1484         value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1485         if (!value)
1486                 goto free_key;
1487
1488         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1489             map->map_type == BPF_MAP_TYPE_STACK) {
1490                 err = map->ops->map_pop_elem(map, value);
1491         } else {
1492                 err = -ENOTSUPP;
1493         }
1494
1495         if (err)
1496                 goto free_value;
1497
1498         if (copy_to_user(uvalue, value, value_size) != 0) {
1499                 err = -EFAULT;
1500                 goto free_value;
1501         }
1502
1503         err = 0;
1504
1505 free_value:
1506         kfree(value);
1507 free_key:
1508         kfree(key);
1509 err_put:
1510         fdput(f);
1511         return err;
1512 }
1513
1514 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
1515
1516 static int map_freeze(const union bpf_attr *attr)
1517 {
1518         int err = 0, ufd = attr->map_fd;
1519         struct bpf_map *map;
1520         struct fd f;
1521
1522         if (CHECK_ATTR(BPF_MAP_FREEZE))
1523                 return -EINVAL;
1524
1525         f = fdget(ufd);
1526         map = __bpf_map_get(f);
1527         if (IS_ERR(map))
1528                 return PTR_ERR(map);
1529
1530         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1531                 fdput(f);
1532                 return -ENOTSUPP;
1533         }
1534
1535         mutex_lock(&map->freeze_mutex);
1536
1537         if (map->writecnt) {
1538                 err = -EBUSY;
1539                 goto err_put;
1540         }
1541         if (READ_ONCE(map->frozen)) {
1542                 err = -EBUSY;
1543                 goto err_put;
1544         }
1545         if (!capable(CAP_SYS_ADMIN)) {
1546                 err = -EPERM;
1547                 goto err_put;
1548         }
1549
1550         WRITE_ONCE(map->frozen, true);
1551 err_put:
1552         mutex_unlock(&map->freeze_mutex);
1553         fdput(f);
1554         return err;
1555 }
1556
1557 static const struct bpf_prog_ops * const bpf_prog_types[] = {
1558 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
1559         [_id] = & _name ## _prog_ops,
1560 #define BPF_MAP_TYPE(_id, _ops)
1561 #include <linux/bpf_types.h>
1562 #undef BPF_PROG_TYPE
1563 #undef BPF_MAP_TYPE
1564 };
1565
1566 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
1567 {
1568         const struct bpf_prog_ops *ops;
1569
1570         if (type >= ARRAY_SIZE(bpf_prog_types))
1571                 return -EINVAL;
1572         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
1573         ops = bpf_prog_types[type];
1574         if (!ops)
1575                 return -EINVAL;
1576
1577         if (!bpf_prog_is_dev_bound(prog->aux))
1578                 prog->aux->ops = ops;
1579         else
1580                 prog->aux->ops = &bpf_offload_prog_ops;
1581         prog->type = type;
1582         return 0;
1583 }
1584
/* Operations reported by bpf_audit_prog(); the values index
 * bpf_audit_str[].
 */
enum bpf_audit {
	BPF_AUDIT_LOAD = 0,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,		/* number of operations, not an operation */
};
1590
1591 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
1592         [BPF_AUDIT_LOAD]   = "LOAD",
1593         [BPF_AUDIT_UNLOAD] = "UNLOAD",
1594 };
1595
1596 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
1597 {
1598         struct audit_context *ctx = NULL;
1599         struct audit_buffer *ab;
1600
1601         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
1602                 return;
1603         if (audit_enabled == AUDIT_OFF)
1604                 return;
1605         if (op == BPF_AUDIT_LOAD)
1606                 ctx = audit_context();
1607         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
1608         if (unlikely(!ab))
1609                 return;
1610         audit_log_format(ab, "prog-id=%u op=%s",
1611                          prog->aux->id, bpf_audit_str[op]);
1612         audit_log_end(ab);
1613 }
1614
1615 int __bpf_prog_charge(struct user_struct *user, u32 pages)
1616 {
1617         unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1618         unsigned long user_bufs;
1619
1620         if (user) {
1621                 user_bufs = atomic_long_add_return(pages, &user->locked_vm);
1622                 if (user_bufs > memlock_limit) {
1623                         atomic_long_sub(pages, &user->locked_vm);
1624                         return -EPERM;
1625                 }
1626         }
1627
1628         return 0;
1629 }
1630
1631 void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
1632 {
1633         if (user)
1634                 atomic_long_sub(pages, &user->locked_vm);
1635 }
1636
1637 static int bpf_prog_charge_memlock(struct bpf_prog *prog)
1638 {
1639         struct user_struct *user = get_current_user();
1640         int ret;
1641
1642         ret = __bpf_prog_charge(user, prog->pages);
1643         if (ret) {
1644                 free_uid(user);
1645                 return ret;
1646         }
1647
1648         prog->aux->user = user;
1649         return 0;
1650 }
1651
1652 static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
1653 {
1654         struct user_struct *user = prog->aux->user;
1655
1656         __bpf_prog_uncharge(user, prog->pages);
1657         free_uid(user);
1658 }
1659
1660 static int bpf_prog_alloc_id(struct bpf_prog *prog)
1661 {
1662         int id;
1663
1664         idr_preload(GFP_KERNEL);
1665         spin_lock_bh(&prog_idr_lock);
1666         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
1667         if (id > 0)
1668                 prog->aux->id = id;
1669         spin_unlock_bh(&prog_idr_lock);
1670         idr_preload_end();
1671
1672         /* id is in [1, INT_MAX) */
1673         if (WARN_ON_ONCE(!id))
1674                 return -ENOSPC;
1675
1676         return id > 0 ? 0 : id;
1677 }
1678
1679 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
1680 {
1681         /* cBPF to eBPF migrations are currently not in the idr store.
1682          * Offloaded programs are removed from the store when their device
1683          * disappears - even if someone grabs an fd to them they are unusable,
1684          * simply waiting for refcnt to drop to be freed.
1685          */
1686         if (!prog->aux->id)
1687                 return;
1688
1689         if (do_idr_lock)
1690                 spin_lock_bh(&prog_idr_lock);
1691         else
1692                 __acquire(&prog_idr_lock);
1693
1694         idr_remove(&prog_idr, prog->aux->id);
1695         prog->aux->id = 0;
1696
1697         if (do_idr_lock)
1698                 spin_unlock_bh(&prog_idr_lock);
1699         else
1700                 __release(&prog_idr_lock);
1701 }
1702
1703 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
1704 {
1705         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
1706
1707         kvfree(aux->func_info);
1708         kfree(aux->func_info_aux);
1709         bpf_prog_uncharge_memlock(aux->prog);
1710         security_bpf_prog_free(aux);
1711         bpf_prog_free(aux->prog);
1712 }
1713
1714 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
1715 {
1716         bpf_prog_kallsyms_del_all(prog);
1717         btf_put(prog->aux->btf);
1718         bpf_prog_free_linfo(prog);
1719
1720         if (deferred)
1721                 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
1722         else
1723                 __bpf_prog_put_rcu(&prog->aux->rcu);
1724 }
1725
1726 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
1727 {
1728         if (atomic64_dec_and_test(&prog->aux->refcnt)) {
1729                 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
1730                 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
1731                 /* bpf_prog_free_id() must be called first */
1732                 bpf_prog_free_id(prog, do_idr_lock);
1733                 __bpf_prog_put_noref(prog, true);
1734         }
1735 }
1736
1737 void bpf_prog_put(struct bpf_prog *prog)
1738 {
1739         __bpf_prog_put(prog, true);
1740 }
1741 EXPORT_SYMBOL_GPL(bpf_prog_put);
1742
1743 static int bpf_prog_release(struct inode *inode, struct file *filp)
1744 {
1745         struct bpf_prog *prog = filp->private_data;
1746
1747         bpf_prog_put(prog);
1748         return 0;
1749 }
1750
1751 static void bpf_prog_get_stats(const struct bpf_prog *prog,
1752                                struct bpf_prog_stats *stats)
1753 {
1754         u64 nsecs = 0, cnt = 0;
1755         int cpu;
1756
1757         for_each_possible_cpu(cpu) {
1758                 const struct bpf_prog_stats *st;
1759                 unsigned int start;
1760                 u64 tnsecs, tcnt;
1761
1762                 st = per_cpu_ptr(prog->aux->stats, cpu);
1763                 do {
1764                         start = u64_stats_fetch_begin_irq(&st->syncp);
1765                         tnsecs = st->nsecs;
1766                         tcnt = st->cnt;
1767                 } while (u64_stats_fetch_retry_irq(&st->syncp, start));
1768                 nsecs += tnsecs;
1769                 cnt += tcnt;
1770         }
1771         stats->nsecs = nsecs;
1772         stats->cnt = cnt;
1773 }
1774
#ifdef CONFIG_PROC_FS
/* ->show_fdinfo() of bpf_prog_fops: print type, JIT state, hex tag,
 * memlock size in bytes, id and accumulated run statistics of the program
 * behind @filp into @m.
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_prog *prog = filp->private_data;
	/* two hex digits per tag byte plus the terminating NUL */
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
	unsigned long long memlock;
	struct bpf_prog_stats stats;

	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
	bpf_prog_get_stats(prog, &stats);
	memlock = prog->pages * 1ULL << PAGE_SHIFT;

	seq_printf(m,
		   "prog_type:\t%u\n"
		   "prog_jited:\t%u\n"
		   "prog_tag:\t%s\n"
		   "memlock:\t%llu\n"
		   "prog_id:\t%u\n"
		   "run_time_ns:\t%llu\n"
		   "run_cnt:\t%llu\n",
		   prog->type,
		   prog->jited,
		   prog_tag,
		   memlock,
		   prog->aux->id,
		   stats.nsecs,
		   stats.cnt);
}
#endif
1801
1802 const struct file_operations bpf_prog_fops = {
1803 #ifdef CONFIG_PROC_FS
1804         .show_fdinfo    = bpf_prog_show_fdinfo,
1805 #endif
1806         .release        = bpf_prog_release,
1807         .read           = bpf_dummy_read,
1808         .write          = bpf_dummy_write,
1809 };
1810
1811 int bpf_prog_new_fd(struct bpf_prog *prog)
1812 {
1813         int ret;
1814
1815         ret = security_bpf_prog(prog);
1816         if (ret < 0)
1817                 return ret;
1818
1819         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
1820                                 O_RDWR | O_CLOEXEC);
1821 }
1822
1823 static struct bpf_prog *____bpf_prog_get(struct fd f)
1824 {
1825         if (!f.file)
1826                 return ERR_PTR(-EBADF);
1827         if (f.file->f_op != &bpf_prog_fops) {
1828                 fdput(f);
1829                 return ERR_PTR(-EINVAL);
1830         }
1831
1832         return f.file->private_data;
1833 }
1834
1835 void bpf_prog_add(struct bpf_prog *prog, int i)
1836 {
1837         atomic64_add(i, &prog->aux->refcnt);
1838 }
1839 EXPORT_SYMBOL_GPL(bpf_prog_add);
1840
1841 void bpf_prog_sub(struct bpf_prog *prog, int i)
1842 {
1843         /* Only to be used for undoing previous bpf_prog_add() in some
1844          * error path. We still know that another entity in our call
1845          * path holds a reference to the program, thus atomic_sub() can
1846          * be safely used in such cases!
1847          */
1848         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
1849 }
1850 EXPORT_SYMBOL_GPL(bpf_prog_sub);
1851
1852 void bpf_prog_inc(struct bpf_prog *prog)
1853 {
1854         atomic64_inc(&prog->aux->refcnt);
1855 }
1856 EXPORT_SYMBOL_GPL(bpf_prog_inc);
1857
1858 /* prog_idr_lock should have been held */
1859 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
1860 {
1861         int refold;
1862
1863         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
1864
1865         if (!refold)
1866                 return ERR_PTR(-ENOENT);
1867
1868         return prog;
1869 }
1870 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
1871
1872 bool bpf_prog_get_ok(struct bpf_prog *prog,
1873                             enum bpf_prog_type *attach_type, bool attach_drv)
1874 {
1875         /* not an attachment, just a refcount inc, always allow */
1876         if (!attach_type)
1877                 return true;
1878
1879         if (prog->type != *attach_type)
1880                 return false;
1881         if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
1882                 return false;
1883
1884         return true;
1885 }
1886
1887 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
1888                                        bool attach_drv)
1889 {
1890         struct fd f = fdget(ufd);
1891         struct bpf_prog *prog;
1892
1893         prog = ____bpf_prog_get(f);
1894         if (IS_ERR(prog))
1895                 return prog;
1896         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
1897                 prog = ERR_PTR(-EINVAL);
1898                 goto out;
1899         }
1900
1901         bpf_prog_inc(prog);
1902 out:
1903         fdput(f);
1904         return prog;
1905 }
1906
1907 struct bpf_prog *bpf_prog_get(u32 ufd)
1908 {
1909         return __bpf_prog_get(ufd, NULL, false);
1910 }
1911
1912 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
1913                                        bool attach_drv)
1914 {
1915         return __bpf_prog_get(ufd, &type, attach_drv);
1916 }
1917 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
1918
1919 /* Initially all BPF programs could be loaded w/o specifying
1920  * expected_attach_type. Later for some of them specifying expected_attach_type
1921  * at load time became required so that program could be validated properly.
1922  * Programs of types that are allowed to be loaded both w/ and w/o (for
1923  * backward compatibility) expected_attach_type, should have the default attach
1924  * type assigned to expected_attach_type for the latter case, so that it can be
1925  * validated later at attach time.
1926  *
1927  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
1928  * prog type requires it but has some attach types that have to be backward
1929  * compatible.
1930  */
1931 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
1932 {
1933         switch (attr->prog_type) {
1934         case BPF_PROG_TYPE_CGROUP_SOCK:
1935                 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
1936                  * exist so checking for non-zero is the way to go here.
1937                  */
1938                 if (!attr->expected_attach_type)
1939                         attr->expected_attach_type =
1940                                 BPF_CGROUP_INET_SOCK_CREATE;
1941                 break;
1942         }
1943 }
1944
1945 static int
1946 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
1947                            enum bpf_attach_type expected_attach_type,
1948                            u32 btf_id, u32 prog_fd)
1949 {
1950         if (btf_id) {
1951                 if (btf_id > BTF_MAX_TYPE)
1952                         return -EINVAL;
1953
1954                 switch (prog_type) {
1955                 case BPF_PROG_TYPE_TRACING:
1956                 case BPF_PROG_TYPE_LSM:
1957                 case BPF_PROG_TYPE_STRUCT_OPS:
1958                 case BPF_PROG_TYPE_EXT:
1959                         break;
1960                 default:
1961                         return -EINVAL;
1962                 }
1963         }
1964
1965         if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
1966             prog_type != BPF_PROG_TYPE_EXT)
1967                 return -EINVAL;
1968
1969         switch (prog_type) {
1970         case BPF_PROG_TYPE_CGROUP_SOCK:
1971                 switch (expected_attach_type) {
1972                 case BPF_CGROUP_INET_SOCK_CREATE:
1973                 case BPF_CGROUP_INET4_POST_BIND:
1974                 case BPF_CGROUP_INET6_POST_BIND:
1975                         return 0;
1976                 default:
1977                         return -EINVAL;
1978                 }
1979         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1980                 switch (expected_attach_type) {
1981                 case BPF_CGROUP_INET4_BIND:
1982                 case BPF_CGROUP_INET6_BIND:
1983                 case BPF_CGROUP_INET4_CONNECT:
1984                 case BPF_CGROUP_INET6_CONNECT:
1985                 case BPF_CGROUP_UDP4_SENDMSG:
1986                 case BPF_CGROUP_UDP6_SENDMSG:
1987                 case BPF_CGROUP_UDP4_RECVMSG:
1988                 case BPF_CGROUP_UDP6_RECVMSG:
1989                         return 0;
1990                 default:
1991                         return -EINVAL;
1992                 }
1993         case BPF_PROG_TYPE_CGROUP_SKB:
1994                 switch (expected_attach_type) {
1995                 case BPF_CGROUP_INET_INGRESS:
1996                 case BPF_CGROUP_INET_EGRESS:
1997                         return 0;
1998                 default:
1999                         return -EINVAL;
2000                 }
2001         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2002                 switch (expected_attach_type) {
2003                 case BPF_CGROUP_SETSOCKOPT:
2004                 case BPF_CGROUP_GETSOCKOPT:
2005                         return 0;
2006                 default:
2007                         return -EINVAL;
2008                 }
2009         case BPF_PROG_TYPE_EXT:
2010                 if (expected_attach_type)
2011                         return -EINVAL;
2012                 /* fallthrough */
2013         default:
2014                 return 0;
2015         }
2016 }
2017
2018 /* last field in 'union bpf_attr' used by this command */
2019 #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
2020
2021 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
2022 {
2023         enum bpf_prog_type type = attr->prog_type;
2024         struct bpf_prog *prog;
2025         int err;
2026         char license[128];
2027         bool is_gpl;
2028
2029         if (CHECK_ATTR(BPF_PROG_LOAD))
2030                 return -EINVAL;
2031
2032         if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2033                                  BPF_F_ANY_ALIGNMENT |
2034                                  BPF_F_TEST_STATE_FREQ |
2035                                  BPF_F_TEST_RND_HI32))
2036                 return -EINVAL;
2037
2038         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2039             (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2040             !capable(CAP_SYS_ADMIN))
2041                 return -EPERM;
2042
2043         /* copy eBPF program license from user space */
2044         if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
2045                               sizeof(license) - 1) < 0)
2046                 return -EFAULT;
2047         license[sizeof(license) - 1] = 0;
2048
2049         /* eBPF programs must be GPL compatible to use GPL-ed functions */
2050         is_gpl = license_is_gpl_compatible(license);
2051
2052         if (attr->insn_cnt == 0 ||
2053             attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
2054                 return -E2BIG;
2055         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2056             type != BPF_PROG_TYPE_CGROUP_SKB &&
2057             !capable(CAP_SYS_ADMIN))
2058                 return -EPERM;
2059
2060         bpf_prog_load_fixup_attach_type(attr);
2061         if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2062                                        attr->attach_btf_id,
2063                                        attr->attach_prog_fd))
2064                 return -EINVAL;
2065
2066         /* plain bpf_prog allocation */
2067         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2068         if (!prog)
2069                 return -ENOMEM;
2070
2071         prog->expected_attach_type = attr->expected_attach_type;
2072         prog->aux->attach_btf_id = attr->attach_btf_id;
2073         if (attr->attach_prog_fd) {
2074                 struct bpf_prog *tgt_prog;
2075
2076                 tgt_prog = bpf_prog_get(attr->attach_prog_fd);
2077                 if (IS_ERR(tgt_prog)) {
2078                         err = PTR_ERR(tgt_prog);
2079                         goto free_prog_nouncharge;
2080                 }
2081                 prog->aux->linked_prog = tgt_prog;
2082         }
2083
2084         prog->aux->offload_requested = !!attr->prog_ifindex;
2085
2086         err = security_bpf_prog_alloc(prog->aux);
2087         if (err)
2088                 goto free_prog_nouncharge;
2089
2090         err = bpf_prog_charge_memlock(prog);
2091         if (err)
2092                 goto free_prog_sec;
2093
2094         prog->len = attr->insn_cnt;
2095
2096         err = -EFAULT;
2097         if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
2098                            bpf_prog_insn_size(prog)) != 0)
2099                 goto free_prog;
2100
2101         prog->orig_prog = NULL;
2102         prog->jited = 0;
2103
2104         atomic64_set(&prog->aux->refcnt, 1);
2105         prog->gpl_compatible = is_gpl ? 1 : 0;
2106
2107         if (bpf_prog_is_dev_bound(prog->aux)) {
2108                 err = bpf_prog_offload_init(prog, attr);
2109                 if (err)
2110                         goto free_prog;
2111         }
2112
2113         /* find program type: socket_filter vs tracing_filter */
2114         err = find_prog_type(type, prog);
2115         if (err < 0)
2116                 goto free_prog;
2117
2118         prog->aux->load_time = ktime_get_boottime_ns();
2119         err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2120                                sizeof(attr->prog_name));
2121         if (err < 0)
2122                 goto free_prog;
2123
2124         /* run eBPF verifier */
2125         err = bpf_check(&prog, attr, uattr);
2126         if (err < 0)
2127                 goto free_used_maps;
2128
2129         prog = bpf_prog_select_runtime(prog, &err);
2130         if (err < 0)
2131                 goto free_used_maps;
2132
2133         err = bpf_prog_alloc_id(prog);
2134         if (err)
2135                 goto free_used_maps;
2136
2137         /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2138          * effectively publicly exposed. However, retrieving via
2139          * bpf_prog_get_fd_by_id() will take another reference,
2140          * therefore it cannot be gone underneath us.
2141          *
2142          * Only for the time /after/ successful bpf_prog_new_fd()
2143          * and before returning to userspace, we might just hold
2144          * one reference and any parallel close on that fd could
2145          * rip everything out. Hence, below notifications must
2146          * happen before bpf_prog_new_fd().
2147          *
2148          * Also, any failure handling from this point onwards must
2149          * be using bpf_prog_put() given the program is exposed.
2150          */
2151         bpf_prog_kallsyms_add(prog);
2152         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2153         bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2154
2155         err = bpf_prog_new_fd(prog);
2156         if (err < 0)
2157                 bpf_prog_put(prog);
2158         return err;
2159
2160 free_used_maps:
2161         /* In case we have subprogs, we need to wait for a grace
2162          * period before we can tear down JIT memory since symbols
2163          * are already exposed under kallsyms.
2164          */
2165         __bpf_prog_put_noref(prog, prog->aux->func_cnt);
2166         return err;
2167 free_prog:
2168         bpf_prog_uncharge_memlock(prog);
2169 free_prog_sec:
2170         security_bpf_prog_free(prog->aux);
2171 free_prog_nouncharge:
2172         bpf_prog_free(prog);
2173         return err;
2174 }
2175
2176 #define BPF_OBJ_LAST_FIELD file_flags
2177
2178 static int bpf_obj_pin(const union bpf_attr *attr)
2179 {
2180         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
2181                 return -EINVAL;
2182
2183         return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
2184 }
2185
2186 static int bpf_obj_get(const union bpf_attr *attr)
2187 {
2188         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2189             attr->file_flags & ~BPF_OBJ_FLAG_MASK)
2190                 return -EINVAL;
2191
2192         return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
2193                                 attr->file_flags);
2194 }
2195
2196 void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
2197                    struct bpf_prog *prog)
2198 {
2199         atomic64_set(&link->refcnt, 1);
2200         link->ops = ops;
2201         link->prog = prog;
2202 }
2203
2204 /* Clean up bpf_link and corresponding anon_inode file and FD. After
2205  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
2206  * anon_inode's release() call. This helper manages marking bpf_link as
2207  * defunct, releases anon_inode file and puts reserved FD.
2208  */
2209 void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
2210                       int link_fd)
2211 {
2212         link->prog = NULL;
2213         fput(link_file);
2214         put_unused_fd(link_fd);
2215 }
2216
2217 void bpf_link_inc(struct bpf_link *link)
2218 {
2219         atomic64_inc(&link->refcnt);
2220 }
2221
2222 /* bpf_link_free is guaranteed to be called from process context */
2223 static void bpf_link_free(struct bpf_link *link)
2224 {
2225         if (link->prog) {
2226                 /* detach BPF program, clean up used resources */
2227                 link->ops->release(link);
2228                 bpf_prog_put(link->prog);
2229         }
2230         /* free bpf_link and its containing memory */
2231         link->ops->dealloc(link);
2232 }
2233
2234 static void bpf_link_put_deferred(struct work_struct *work)
2235 {
2236         struct bpf_link *link = container_of(work, struct bpf_link, work);
2237
2238         bpf_link_free(link);
2239 }
2240
2241 /* bpf_link_put can be called from atomic context, but ensures that resources
2242  * are freed from process context
2243  */
2244 void bpf_link_put(struct bpf_link *link)
2245 {
2246         if (!atomic64_dec_and_test(&link->refcnt))
2247                 return;
2248
2249         if (in_atomic()) {
2250                 INIT_WORK(&link->work, bpf_link_put_deferred);
2251                 schedule_work(&link->work);
2252         } else {
2253                 bpf_link_free(link);
2254         }
2255 }
2256
2257 static int bpf_link_release(struct inode *inode, struct file *filp)
2258 {
2259         struct bpf_link *link = filp->private_data;
2260
2261         bpf_link_put(link);
2262         return 0;
2263 }
2264
2265 #ifdef CONFIG_PROC_FS
2266 static const struct bpf_link_ops bpf_raw_tp_lops;
2267 static const struct bpf_link_ops bpf_tracing_link_lops;
2268
2269 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2270 {
2271         const struct bpf_link *link = filp->private_data;
2272         const struct bpf_prog *prog = link->prog;
2273         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2274         const char *link_type;
2275
2276         if (link->ops == &bpf_raw_tp_lops)
2277                 link_type = "raw_tracepoint";
2278         else if (link->ops == &bpf_tracing_link_lops)
2279                 link_type = "tracing";
2280 #ifdef CONFIG_CGROUP_BPF
2281         else if (link->ops == &bpf_cgroup_link_lops)
2282                 link_type = "cgroup";
2283 #endif
2284         else
2285                 link_type = "unknown";
2286
2287         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2288         seq_printf(m,
2289                    "link_type:\t%s\n"
2290                    "prog_tag:\t%s\n"
2291                    "prog_id:\t%u\n",
2292                    link_type,
2293                    prog_tag,
2294                    prog->aux->id);
2295 }
2296 #endif
2297
2298 static const struct file_operations bpf_link_fops = {
2299 #ifdef CONFIG_PROC_FS
2300         .show_fdinfo    = bpf_link_show_fdinfo,
2301 #endif
2302         .release        = bpf_link_release,
2303         .read           = bpf_dummy_read,
2304         .write          = bpf_dummy_write,
2305 };
2306
2307 int bpf_link_new_fd(struct bpf_link *link)
2308 {
2309         return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
2310 }
2311
2312 /* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
2313  * instead of immediately installing fd in fdtable, just reserve it and
2314  * return. Caller then need to either install it with fd_install(fd, file) or
2315  * release with put_unused_fd(fd).
2316  * This is useful for cases when bpf_link attachment/detachment are
2317  * complicated and expensive operations and should be delayed until all the fd
2318  * reservation and anon_inode creation succeeds.
2319  */
2320 struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
2321 {
2322         struct file *file;
2323         int fd;
2324
2325         fd = get_unused_fd_flags(O_CLOEXEC);
2326         if (fd < 0)
2327                 return ERR_PTR(fd);
2328
2329         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2330         if (IS_ERR(file)) {
2331                 put_unused_fd(fd);
2332                 return file;
2333         }
2334
2335         *reserved_fd = fd;
2336         return file;
2337 }
2338
2339 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
2340 {
2341         struct fd f = fdget(ufd);
2342         struct bpf_link *link;
2343
2344         if (!f.file)
2345                 return ERR_PTR(-EBADF);
2346         if (f.file->f_op != &bpf_link_fops) {
2347                 fdput(f);
2348                 return ERR_PTR(-EINVAL);
2349         }
2350
2351         link = f.file->private_data;
2352         bpf_link_inc(link);
2353         fdput(f);
2354
2355         return link;
2356 }
2357
2358 struct bpf_tracing_link {
2359         struct bpf_link link;
2360 };
2361
2362 static void bpf_tracing_link_release(struct bpf_link *link)
2363 {
2364         WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
2365 }
2366
2367 static void bpf_tracing_link_dealloc(struct bpf_link *link)
2368 {
2369         struct bpf_tracing_link *tr_link =
2370                 container_of(link, struct bpf_tracing_link, link);
2371
2372         kfree(tr_link);
2373 }
2374
2375 static const struct bpf_link_ops bpf_tracing_link_lops = {
2376         .release = bpf_tracing_link_release,
2377         .dealloc = bpf_tracing_link_dealloc,
2378 };
2379
2380 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
2381 {
2382         struct bpf_tracing_link *link;
2383         struct file *link_file;
2384         int link_fd, err;
2385
2386         switch (prog->type) {
2387         case BPF_PROG_TYPE_TRACING:
2388                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
2389                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
2390                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
2391                         err = -EINVAL;
2392                         goto out_put_prog;
2393                 }
2394                 break;
2395         case BPF_PROG_TYPE_EXT:
2396                 if (prog->expected_attach_type != 0) {
2397                         err = -EINVAL;
2398                         goto out_put_prog;
2399                 }
2400                 break;
2401         case BPF_PROG_TYPE_LSM:
2402                 if (prog->expected_attach_type != BPF_LSM_MAC) {
2403                         err = -EINVAL;
2404                         goto out_put_prog;
2405                 }
2406                 break;
2407         default:
2408                 err = -EINVAL;
2409                 goto out_put_prog;
2410         }
2411
2412         link = kzalloc(sizeof(*link), GFP_USER);
2413         if (!link) {
2414                 err = -ENOMEM;
2415                 goto out_put_prog;
2416         }
2417         bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
2418
2419         link_file = bpf_link_new_file(&link->link, &link_fd);
2420         if (IS_ERR(link_file)) {
2421                 kfree(link);
2422                 err = PTR_ERR(link_file);
2423                 goto out_put_prog;
2424         }
2425
2426         err = bpf_trampoline_link_prog(prog);
2427         if (err) {
2428                 bpf_link_cleanup(&link->link, link_file, link_fd);
2429                 goto out_put_prog;
2430         }
2431
2432         fd_install(link_fd, link_file);
2433         return link_fd;
2434
2435 out_put_prog:
2436         bpf_prog_put(prog);
2437         return err;
2438 }
2439
2440 struct bpf_raw_tp_link {
2441         struct bpf_link link;
2442         struct bpf_raw_event_map *btp;
2443 };
2444
2445 static void bpf_raw_tp_link_release(struct bpf_link *link)
2446 {
2447         struct bpf_raw_tp_link *raw_tp =
2448                 container_of(link, struct bpf_raw_tp_link, link);
2449
2450         bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
2451         bpf_put_raw_tracepoint(raw_tp->btp);
2452 }
2453
2454 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
2455 {
2456         struct bpf_raw_tp_link *raw_tp =
2457                 container_of(link, struct bpf_raw_tp_link, link);
2458
2459         kfree(raw_tp);
2460 }
2461
2462 static const struct bpf_link_ops bpf_raw_tp_lops = {
2463         .release = bpf_raw_tp_link_release,
2464         .dealloc = bpf_raw_tp_link_dealloc,
2465 };
2466
2467 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
2468
2469 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
2470 {
2471         struct bpf_raw_tp_link *link;
2472         struct bpf_raw_event_map *btp;
2473         struct file *link_file;
2474         struct bpf_prog *prog;
2475         const char *tp_name;
2476         char buf[128];
2477         int link_fd, err;
2478
2479         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
2480                 return -EINVAL;
2481
2482         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
2483         if (IS_ERR(prog))
2484                 return PTR_ERR(prog);
2485
2486         switch (prog->type) {
2487         case BPF_PROG_TYPE_TRACING:
2488         case BPF_PROG_TYPE_EXT:
2489         case BPF_PROG_TYPE_LSM:
2490                 if (attr->raw_tracepoint.name) {
2491                         /* The attach point for this category of programs
2492                          * should be specified via btf_id during program load.
2493                          */
2494                         err = -EINVAL;
2495                         goto out_put_prog;
2496                 }
2497                 if (prog->type == BPF_PROG_TYPE_TRACING &&
2498                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
2499                         tp_name = prog->aux->attach_func_name;
2500                         break;
2501                 }
2502                 return bpf_tracing_prog_attach(prog);
2503         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2504         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2505                 if (strncpy_from_user(buf,
2506                                       u64_to_user_ptr(attr->raw_tracepoint.name),
2507                                       sizeof(buf) - 1) < 0) {
2508                         err = -EFAULT;
2509                         goto out_put_prog;
2510                 }
2511                 buf[sizeof(buf) - 1] = 0;
2512                 tp_name = buf;
2513                 break;
2514         default:
2515                 err = -EINVAL;
2516                 goto out_put_prog;
2517         }
2518
2519         btp = bpf_get_raw_tracepoint(tp_name);
2520         if (!btp) {
2521                 err = -ENOENT;
2522                 goto out_put_prog;
2523         }
2524
2525         link = kzalloc(sizeof(*link), GFP_USER);
2526         if (!link) {
2527                 err = -ENOMEM;
2528                 goto out_put_btp;
2529         }
2530         bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
2531         link->btp = btp;
2532
2533         link_file = bpf_link_new_file(&link->link, &link_fd);
2534         if (IS_ERR(link_file)) {
2535                 kfree(link);
2536                 err = PTR_ERR(link_file);
2537                 goto out_put_btp;
2538         }
2539
2540         err = bpf_probe_register(link->btp, prog);
2541         if (err) {
2542                 bpf_link_cleanup(&link->link, link_file, link_fd);
2543                 goto out_put_btp;
2544         }
2545
2546         fd_install(link_fd, link_file);
2547         return link_fd;
2548
2549 out_put_btp:
2550         bpf_put_raw_tracepoint(btp);
2551 out_put_prog:
2552         bpf_prog_put(prog);
2553         return err;
2554 }
2555
2556 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
2557                                              enum bpf_attach_type attach_type)
2558 {
2559         switch (prog->type) {
2560         case BPF_PROG_TYPE_CGROUP_SOCK:
2561         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2562         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2563                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
2564         case BPF_PROG_TYPE_CGROUP_SKB:
2565                 return prog->enforce_expected_attach_type &&
2566                         prog->expected_attach_type != attach_type ?
2567                         -EINVAL : 0;
2568         default:
2569                 return 0;
2570         }
2571 }
2572
2573 static enum bpf_prog_type
2574 attach_type_to_prog_type(enum bpf_attach_type attach_type)
2575 {
2576         switch (attach_type) {
2577         case BPF_CGROUP_INET_INGRESS:
2578         case BPF_CGROUP_INET_EGRESS:
2579                 return BPF_PROG_TYPE_CGROUP_SKB;
2580                 break;
2581         case BPF_CGROUP_INET_SOCK_CREATE:
2582         case BPF_CGROUP_INET4_POST_BIND:
2583         case BPF_CGROUP_INET6_POST_BIND:
2584                 return BPF_PROG_TYPE_CGROUP_SOCK;
2585         case BPF_CGROUP_INET4_BIND:
2586         case BPF_CGROUP_INET6_BIND:
2587         case BPF_CGROUP_INET4_CONNECT:
2588         case BPF_CGROUP_INET6_CONNECT:
2589         case BPF_CGROUP_UDP4_SENDMSG:
2590         case BPF_CGROUP_UDP6_SENDMSG:
2591         case BPF_CGROUP_UDP4_RECVMSG:
2592         case BPF_CGROUP_UDP6_RECVMSG:
2593                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
2594         case BPF_CGROUP_SOCK_OPS:
2595                 return BPF_PROG_TYPE_SOCK_OPS;
2596         case BPF_CGROUP_DEVICE:
2597                 return BPF_PROG_TYPE_CGROUP_DEVICE;
2598         case BPF_SK_MSG_VERDICT:
2599                 return BPF_PROG_TYPE_SK_MSG;
2600         case BPF_SK_SKB_STREAM_PARSER:
2601         case BPF_SK_SKB_STREAM_VERDICT:
2602                 return BPF_PROG_TYPE_SK_SKB;
2603         case BPF_LIRC_MODE2:
2604                 return BPF_PROG_TYPE_LIRC_MODE2;
2605         case BPF_FLOW_DISSECTOR:
2606                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
2607         case BPF_CGROUP_SYSCTL:
2608                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
2609         case BPF_CGROUP_GETSOCKOPT:
2610         case BPF_CGROUP_SETSOCKOPT:
2611                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
2612         default:
2613                 return BPF_PROG_TYPE_UNSPEC;
2614         }
2615 }
2616
2617 #define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
2618
2619 #define BPF_F_ATTACH_MASK \
2620         (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
2621
2622 static int bpf_prog_attach(const union bpf_attr *attr)
2623 {
2624         enum bpf_prog_type ptype;
2625         struct bpf_prog *prog;
2626         int ret;
2627
2628         if (!capable(CAP_NET_ADMIN))
2629                 return -EPERM;
2630
2631         if (CHECK_ATTR(BPF_PROG_ATTACH))
2632                 return -EINVAL;
2633
2634         if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
2635                 return -EINVAL;
2636
2637         ptype = attach_type_to_prog_type(attr->attach_type);
2638         if (ptype == BPF_PROG_TYPE_UNSPEC)
2639                 return -EINVAL;
2640
2641         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
2642         if (IS_ERR(prog))
2643                 return PTR_ERR(prog);
2644
2645         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
2646                 bpf_prog_put(prog);
2647                 return -EINVAL;
2648         }
2649
2650         switch (ptype) {
2651         case BPF_PROG_TYPE_SK_SKB:
2652         case BPF_PROG_TYPE_SK_MSG:
2653                 ret = sock_map_get_from_fd(attr, prog);
2654                 break;
2655         case BPF_PROG_TYPE_LIRC_MODE2:
2656                 ret = lirc_prog_attach(attr, prog);
2657                 break;
2658         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2659                 ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
2660                 break;
2661         case BPF_PROG_TYPE_CGROUP_DEVICE:
2662         case BPF_PROG_TYPE_CGROUP_SKB:
2663         case BPF_PROG_TYPE_CGROUP_SOCK:
2664         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2665         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2666         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2667         case BPF_PROG_TYPE_SOCK_OPS:
2668                 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
2669                 break;
2670         default:
2671                 ret = -EINVAL;
2672         }
2673
2674         if (ret)
2675                 bpf_prog_put(prog);
2676         return ret;
2677 }
2678
2679 #define BPF_PROG_DETACH_LAST_FIELD attach_type
2680
2681 static int bpf_prog_detach(const union bpf_attr *attr)
2682 {
2683         enum bpf_prog_type ptype;
2684
2685         if (!capable(CAP_NET_ADMIN))
2686                 return -EPERM;
2687
2688         if (CHECK_ATTR(BPF_PROG_DETACH))
2689                 return -EINVAL;
2690
2691         ptype = attach_type_to_prog_type(attr->attach_type);
2692
2693         switch (ptype) {
2694         case BPF_PROG_TYPE_SK_MSG:
2695         case BPF_PROG_TYPE_SK_SKB:
2696                 return sock_map_get_from_fd(attr, NULL);
2697         case BPF_PROG_TYPE_LIRC_MODE2:
2698                 return lirc_prog_detach(attr);
2699         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2700                 return skb_flow_dissector_bpf_prog_detach(attr);
2701         case BPF_PROG_TYPE_CGROUP_DEVICE:
2702         case BPF_PROG_TYPE_CGROUP_SKB:
2703         case BPF_PROG_TYPE_CGROUP_SOCK:
2704         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2705         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2706         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2707         case BPF_PROG_TYPE_SOCK_OPS:
2708                 return cgroup_bpf_prog_detach(attr, ptype);
2709         default:
2710                 return -EINVAL;
2711         }
2712 }
2713
2714 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
2715
2716 static int bpf_prog_query(const union bpf_attr *attr,
2717                           union bpf_attr __user *uattr)
2718 {
2719         if (!capable(CAP_NET_ADMIN))
2720                 return -EPERM;
2721         if (CHECK_ATTR(BPF_PROG_QUERY))
2722                 return -EINVAL;
2723         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
2724                 return -EINVAL;
2725
2726         switch (attr->query.attach_type) {
2727         case BPF_CGROUP_INET_INGRESS:
2728         case BPF_CGROUP_INET_EGRESS:
2729         case BPF_CGROUP_INET_SOCK_CREATE:
2730         case BPF_CGROUP_INET4_BIND:
2731         case BPF_CGROUP_INET6_BIND:
2732         case BPF_CGROUP_INET4_POST_BIND:
2733         case BPF_CGROUP_INET6_POST_BIND:
2734         case BPF_CGROUP_INET4_CONNECT:
2735         case BPF_CGROUP_INET6_CONNECT:
2736         case BPF_CGROUP_UDP4_SENDMSG:
2737         case BPF_CGROUP_UDP6_SENDMSG:
2738         case BPF_CGROUP_UDP4_RECVMSG:
2739         case BPF_CGROUP_UDP6_RECVMSG:
2740         case BPF_CGROUP_SOCK_OPS:
2741         case BPF_CGROUP_DEVICE:
2742         case BPF_CGROUP_SYSCTL:
2743         case BPF_CGROUP_GETSOCKOPT:
2744         case BPF_CGROUP_SETSOCKOPT:
2745                 return cgroup_bpf_prog_query(attr, uattr);
2746         case BPF_LIRC_MODE2:
2747                 return lirc_prog_query(attr, uattr);
2748         case BPF_FLOW_DISSECTOR:
2749                 return skb_flow_dissector_prog_query(attr, uattr);
2750         default:
2751                 return -EINVAL;
2752         }
2753 }
2754
2755 #define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
2756
2757 static int bpf_prog_test_run(const union bpf_attr *attr,
2758                              union bpf_attr __user *uattr)
2759 {
2760         struct bpf_prog *prog;
2761         int ret = -ENOTSUPP;
2762
2763         if (!capable(CAP_SYS_ADMIN))
2764                 return -EPERM;
2765         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
2766                 return -EINVAL;
2767
2768         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
2769             (!attr->test.ctx_size_in && attr->test.ctx_in))
2770                 return -EINVAL;
2771
2772         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
2773             (!attr->test.ctx_size_out && attr->test.ctx_out))
2774                 return -EINVAL;
2775
2776         prog = bpf_prog_get(attr->test.prog_fd);
2777         if (IS_ERR(prog))
2778                 return PTR_ERR(prog);
2779
2780         if (prog->aux->ops->test_run)
2781                 ret = prog->aux->ops->test_run(prog, attr, uattr);
2782
2783         bpf_prog_put(prog);
2784         return ret;
2785 }
2786
2787 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
2788
2789 static int bpf_obj_get_next_id(const union bpf_attr *attr,
2790                                union bpf_attr __user *uattr,
2791                                struct idr *idr,
2792                                spinlock_t *lock)
2793 {
2794         u32 next_id = attr->start_id;
2795         int err = 0;
2796
2797         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
2798                 return -EINVAL;
2799
2800         if (!capable(CAP_SYS_ADMIN))
2801                 return -EPERM;
2802
2803         next_id++;
2804         spin_lock_bh(lock);
2805         if (!idr_get_next(idr, &next_id))
2806                 err = -ENOENT;
2807         spin_unlock_bh(lock);
2808
2809         if (!err)
2810                 err = put_user(next_id, &uattr->next_id);
2811
2812         return err;
2813 }
2814
2815 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
2816
2817 struct bpf_prog *bpf_prog_by_id(u32 id)
2818 {
2819         struct bpf_prog *prog;
2820
2821         if (!id)
2822                 return ERR_PTR(-ENOENT);
2823
2824         spin_lock_bh(&prog_idr_lock);
2825         prog = idr_find(&prog_idr, id);
2826         if (prog)
2827                 prog = bpf_prog_inc_not_zero(prog);
2828         else
2829                 prog = ERR_PTR(-ENOENT);
2830         spin_unlock_bh(&prog_idr_lock);
2831         return prog;
2832 }
2833
2834 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
2835 {
2836         struct bpf_prog *prog;
2837         u32 id = attr->prog_id;
2838         int fd;
2839
2840         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
2841                 return -EINVAL;
2842
2843         if (!capable(CAP_SYS_ADMIN))
2844                 return -EPERM;
2845
2846         prog = bpf_prog_by_id(id);
2847         if (IS_ERR(prog))
2848                 return PTR_ERR(prog);
2849
2850         fd = bpf_prog_new_fd(prog);
2851         if (fd < 0)
2852                 bpf_prog_put(prog);
2853
2854         return fd;
2855 }
2856
2857 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
2858
2859 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
2860 {
2861         struct bpf_map *map;
2862         u32 id = attr->map_id;
2863         int f_flags;
2864         int fd;
2865
2866         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
2867             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
2868                 return -EINVAL;
2869
2870         if (!capable(CAP_SYS_ADMIN))
2871                 return -EPERM;
2872
2873         f_flags = bpf_get_file_flag(attr->open_flags);
2874         if (f_flags < 0)
2875                 return f_flags;
2876
2877         spin_lock_bh(&map_idr_lock);
2878         map = idr_find(&map_idr, id);
2879         if (map)
2880                 map = __bpf_map_inc_not_zero(map, true);
2881         else
2882                 map = ERR_PTR(-ENOENT);
2883         spin_unlock_bh(&map_idr_lock);
2884
2885         if (IS_ERR(map))
2886                 return PTR_ERR(map);
2887
2888         fd = bpf_map_new_fd(map, f_flags);
2889         if (fd < 0)
2890                 bpf_map_put_with_uref(map);
2891
2892         return fd;
2893 }
2894
2895 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
2896                                               unsigned long addr, u32 *off,
2897                                               u32 *type)
2898 {
2899         const struct bpf_map *map;
2900         int i;
2901
2902         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
2903                 map = prog->aux->used_maps[i];
2904                 if (map == (void *)addr) {
2905                         *type = BPF_PSEUDO_MAP_FD;
2906                         return map;
2907                 }
2908                 if (!map->ops->map_direct_value_meta)
2909                         continue;
2910                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
2911                         *type = BPF_PSEUDO_MAP_VALUE;
2912                         return map;
2913                 }
2914         }
2915
2916         return NULL;
2917 }
2918
2919 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
2920 {
2921         const struct bpf_map *map;
2922         struct bpf_insn *insns;
2923         u32 off, type;
2924         u64 imm;
2925         int i;
2926
2927         insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
2928                         GFP_USER);
2929         if (!insns)
2930                 return insns;
2931
2932         for (i = 0; i < prog->len; i++) {
2933                 if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
2934                         insns[i].code = BPF_JMP | BPF_CALL;
2935                         insns[i].imm = BPF_FUNC_tail_call;
2936                         /* fall-through */
2937                 }
2938                 if (insns[i].code == (BPF_JMP | BPF_CALL) ||
2939                     insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
2940                         if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
2941                                 insns[i].code = BPF_JMP | BPF_CALL;
2942                         if (!bpf_dump_raw_ok())
2943                                 insns[i].imm = 0;
2944                         continue;
2945                 }
2946
2947                 if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
2948                         continue;
2949
2950                 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
2951                 map = bpf_map_from_imm(prog, imm, &off, &type);
2952                 if (map) {
2953                         insns[i].src_reg = type;
2954                         insns[i].imm = map->id;
2955                         insns[i + 1].imm = off;
2956                         continue;
2957                 }
2958         }
2959
2960         return insns;
2961 }
2962
2963 static int set_info_rec_size(struct bpf_prog_info *info)
2964 {
2965         /*
2966          * Ensure info.*_rec_size is the same as kernel expected size
2967          *
2968          * or
2969          *
2970          * Only allow zero *_rec_size if both _rec_size and _cnt are
2971          * zero.  In this case, the kernel will set the expected
2972          * _rec_size back to the info.
2973          */
2974
2975         if ((info->nr_func_info || info->func_info_rec_size) &&
2976             info->func_info_rec_size != sizeof(struct bpf_func_info))
2977                 return -EINVAL;
2978
2979         if ((info->nr_line_info || info->line_info_rec_size) &&
2980             info->line_info_rec_size != sizeof(struct bpf_line_info))
2981                 return -EINVAL;
2982
2983         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
2984             info->jited_line_info_rec_size != sizeof(__u64))
2985                 return -EINVAL;
2986
2987         info->func_info_rec_size = sizeof(struct bpf_func_info);
2988         info->line_info_rec_size = sizeof(struct bpf_line_info);
2989         info->jited_line_info_rec_size = sizeof(__u64);
2990
2991         return 0;
2992 }
2993
2994 static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
2995                                    const union bpf_attr *attr,
2996                                    union bpf_attr __user *uattr)
2997 {
2998         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
2999         struct bpf_prog_info info;
3000         u32 info_len = attr->info.info_len;
3001         struct bpf_prog_stats stats;
3002         char __user *uinsns;
3003         u32 ulen;
3004         int err;
3005
3006         err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
3007         if (err)
3008                 return err;
3009         info_len = min_t(u32, sizeof(info), info_len);
3010
3011         memset(&info, 0, sizeof(info));
3012         if (copy_from_user(&info, uinfo, info_len))
3013                 return -EFAULT;
3014
3015         info.type = prog->type;
3016         info.id = prog->aux->id;
3017         info.load_time = prog->aux->load_time;
3018         info.created_by_uid = from_kuid_munged(current_user_ns(),
3019                                                prog->aux->user->uid);
3020         info.gpl_compatible = prog->gpl_compatible;
3021
3022         memcpy(info.tag, prog->tag, sizeof(prog->tag));
3023         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
3024
3025         ulen = info.nr_map_ids;
3026         info.nr_map_ids = prog->aux->used_map_cnt;
3027         ulen = min_t(u32, info.nr_map_ids, ulen);
3028         if (ulen) {
3029                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
3030                 u32 i;
3031
3032                 for (i = 0; i < ulen; i++)
3033                         if (put_user(prog->aux->used_maps[i]->id,
3034                                      &user_map_ids[i]))
3035                                 return -EFAULT;
3036         }
3037
3038         err = set_info_rec_size(&info);
3039         if (err)
3040                 return err;
3041
3042         bpf_prog_get_stats(prog, &stats);
3043         info.run_time_ns = stats.nsecs;
3044         info.run_cnt = stats.cnt;
3045
3046         if (!capable(CAP_SYS_ADMIN)) {
3047                 info.jited_prog_len = 0;
3048                 info.xlated_prog_len = 0;
3049                 info.nr_jited_ksyms = 0;
3050                 info.nr_jited_func_lens = 0;
3051                 info.nr_func_info = 0;
3052                 info.nr_line_info = 0;
3053                 info.nr_jited_line_info = 0;
3054                 goto done;
3055         }
3056
3057         ulen = info.xlated_prog_len;
3058         info.xlated_prog_len = bpf_prog_insn_size(prog);
3059         if (info.xlated_prog_len && ulen) {
3060                 struct bpf_insn *insns_sanitized;
3061                 bool fault;
3062
3063                 if (prog->blinded && !bpf_dump_raw_ok()) {
3064                         info.xlated_prog_insns = 0;
3065                         goto done;
3066                 }
3067                 insns_sanitized = bpf_insn_prepare_dump(prog);
3068                 if (!insns_sanitized)
3069                         return -ENOMEM;
3070                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
3071                 ulen = min_t(u32, info.xlated_prog_len, ulen);
3072                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
3073                 kfree(insns_sanitized);
3074                 if (fault)
3075                         return -EFAULT;
3076         }
3077
3078         if (bpf_prog_is_dev_bound(prog->aux)) {
3079                 err = bpf_prog_offload_info_fill(&info, prog);
3080                 if (err)
3081                         return err;
3082                 goto done;
3083         }
3084
3085         /* NOTE: the following code is supposed to be skipped for offload.
3086          * bpf_prog_offload_info_fill() is the place to fill similar fields
3087          * for offload.
3088          */
3089         ulen = info.jited_prog_len;
3090         if (prog->aux->func_cnt) {
3091                 u32 i;
3092
3093                 info.jited_prog_len = 0;
3094                 for (i = 0; i < prog->aux->func_cnt; i++)
3095                         info.jited_prog_len += prog->aux->func[i]->jited_len;
3096         } else {
3097                 info.jited_prog_len = prog->jited_len;
3098         }
3099
3100         if (info.jited_prog_len && ulen) {
3101                 if (bpf_dump_raw_ok()) {
3102                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
3103                         ulen = min_t(u32, info.jited_prog_len, ulen);
3104
3105                         /* for multi-function programs, copy the JITed
3106                          * instructions for all the functions
3107                          */
3108                         if (prog->aux->func_cnt) {
3109                                 u32 len, free, i;
3110                                 u8 *img;
3111
3112                                 free = ulen;
3113                                 for (i = 0; i < prog->aux->func_cnt; i++) {
3114                                         len = prog->aux->func[i]->jited_len;
3115                                         len = min_t(u32, len, free);
3116                                         img = (u8 *) prog->aux->func[i]->bpf_func;
3117                                         if (copy_to_user(uinsns, img, len))
3118                                                 return -EFAULT;
3119                                         uinsns += len;
3120                                         free -= len;
3121                                         if (!free)
3122                                                 break;
3123                                 }
3124                         } else {
3125                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
3126                                         return -EFAULT;
3127                         }
3128                 } else {
3129                         info.jited_prog_insns = 0;
3130                 }
3131         }
3132
3133         ulen = info.nr_jited_ksyms;
3134         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
3135         if (ulen) {
3136                 if (bpf_dump_raw_ok()) {
3137                         unsigned long ksym_addr;
3138                         u64 __user *user_ksyms;
3139                         u32 i;
3140
3141                         /* copy the address of the kernel symbol
3142                          * corresponding to each function
3143                          */
3144                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
3145                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
3146                         if (prog->aux->func_cnt) {
3147                                 for (i = 0; i < ulen; i++) {
3148                                         ksym_addr = (unsigned long)
3149                                                 prog->aux->func[i]->bpf_func;
3150                                         if (put_user((u64) ksym_addr,
3151                                                      &user_ksyms[i]))
3152                                                 return -EFAULT;
3153                                 }
3154                         } else {
3155                                 ksym_addr = (unsigned long) prog->bpf_func;
3156                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
3157                                         return -EFAULT;
3158                         }
3159                 } else {
3160                         info.jited_ksyms = 0;
3161                 }
3162         }
3163
3164         ulen = info.nr_jited_func_lens;
3165         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
3166         if (ulen) {
3167                 if (bpf_dump_raw_ok()) {
3168                         u32 __user *user_lens;
3169                         u32 func_len, i;
3170
3171                         /* copy the JITed image lengths for each function */
3172                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
3173                         user_lens = u64_to_user_ptr(info.jited_func_lens);
3174                         if (prog->aux->func_cnt) {
3175                                 for (i = 0; i < ulen; i++) {
3176                                         func_len =
3177                                                 prog->aux->func[i]->jited_len;
3178                                         if (put_user(func_len, &user_lens[i]))
3179                                                 return -EFAULT;
3180                                 }
3181                         } else {
3182                                 func_len = prog->jited_len;
3183                                 if (put_user(func_len, &user_lens[0]))
3184                                         return -EFAULT;
3185                         }
3186                 } else {
3187                         info.jited_func_lens = 0;
3188                 }
3189         }
3190
3191         if (prog->aux->btf)
3192                 info.btf_id = btf_id(prog->aux->btf);
3193
3194         ulen = info.nr_func_info;
3195         info.nr_func_info = prog->aux->func_info_cnt;
3196         if (info.nr_func_info && ulen) {
3197                 char __user *user_finfo;
3198
3199                 user_finfo = u64_to_user_ptr(info.func_info);
3200                 ulen = min_t(u32, info.nr_func_info, ulen);
3201                 if (copy_to_user(user_finfo, prog->aux->func_info,
3202                                  info.func_info_rec_size * ulen))
3203                         return -EFAULT;
3204         }
3205
3206         ulen = info.nr_line_info;
3207         info.nr_line_info = prog->aux->nr_linfo;
3208         if (info.nr_line_info && ulen) {
3209                 __u8 __user *user_linfo;
3210
3211                 user_linfo = u64_to_user_ptr(info.line_info);
3212                 ulen = min_t(u32, info.nr_line_info, ulen);
3213                 if (copy_to_user(user_linfo, prog->aux->linfo,
3214                                  info.line_info_rec_size * ulen))
3215                         return -EFAULT;
3216         }
3217
3218         ulen = info.nr_jited_line_info;
3219         if (prog->aux->jited_linfo)
3220                 info.nr_jited_line_info = prog->aux->nr_linfo;
3221         else
3222                 info.nr_jited_line_info = 0;
3223         if (info.nr_jited_line_info && ulen) {
3224                 if (bpf_dump_raw_ok()) {
3225                         __u64 __user *user_linfo;
3226                         u32 i;
3227
3228                         user_linfo = u64_to_user_ptr(info.jited_line_info);
3229                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
3230                         for (i = 0; i < ulen; i++) {
3231                                 if (put_user((__u64)(long)prog->aux->jited_linfo[i],
3232                                              &user_linfo[i]))
3233                                         return -EFAULT;
3234                         }
3235                 } else {
3236                         info.jited_line_info = 0;
3237                 }
3238         }
3239
3240         ulen = info.nr_prog_tags;
3241         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
3242         if (ulen) {
3243                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
3244                 u32 i;
3245
3246                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
3247                 ulen = min_t(u32, info.nr_prog_tags, ulen);
3248                 if (prog->aux->func_cnt) {
3249                         for (i = 0; i < ulen; i++) {
3250                                 if (copy_to_user(user_prog_tags[i],
3251                                                  prog->aux->func[i]->tag,
3252                                                  BPF_TAG_SIZE))
3253                                         return -EFAULT;
3254                         }
3255                 } else {
3256                         if (copy_to_user(user_prog_tags[0],
3257                                          prog->tag, BPF_TAG_SIZE))
3258                                 return -EFAULT;
3259                 }
3260         }
3261
3262 done:
3263         if (copy_to_user(uinfo, &info, info_len) ||
3264             put_user(info_len, &uattr->info.info_len))
3265                 return -EFAULT;
3266
3267         return 0;
3268 }
3269
3270 static int bpf_map_get_info_by_fd(struct bpf_map *map,
3271                                   const union bpf_attr *attr,
3272                                   union bpf_attr __user *uattr)
3273 {
3274         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3275         struct bpf_map_info info;
3276         u32 info_len = attr->info.info_len;
3277         int err;
3278
3279         err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
3280         if (err)
3281                 return err;
3282         info_len = min_t(u32, sizeof(info), info_len);
3283
3284         memset(&info, 0, sizeof(info));
3285         info.type = map->map_type;
3286         info.id = map->id;
3287         info.key_size = map->key_size;
3288         info.value_size = map->value_size;
3289         info.max_entries = map->max_entries;
3290         info.map_flags = map->map_flags;
3291         memcpy(info.name, map->name, sizeof(map->name));
3292
3293         if (map->btf) {
3294                 info.btf_id = btf_id(map->btf);
3295                 info.btf_key_type_id = map->btf_key_type_id;
3296                 info.btf_value_type_id = map->btf_value_type_id;
3297         }
3298         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
3299
3300         if (bpf_map_is_dev_bound(map)) {
3301                 err = bpf_map_offload_info_fill(&info, map);
3302                 if (err)
3303                         return err;
3304         }
3305
3306         if (copy_to_user(uinfo, &info, info_len) ||
3307             put_user(info_len, &uattr->info.info_len))
3308                 return -EFAULT;
3309
3310         return 0;
3311 }
3312
3313 static int bpf_btf_get_info_by_fd(struct btf *btf,
3314                                   const union bpf_attr *attr,
3315                                   union bpf_attr __user *uattr)
3316 {
3317         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3318         u32 info_len = attr->info.info_len;
3319         int err;
3320
3321         err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
3322         if (err)
3323                 return err;
3324
3325         return btf_get_info_by_fd(btf, attr, uattr);
3326 }
3327
3328 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
3329
3330 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
3331                                   union bpf_attr __user *uattr)
3332 {
3333         int ufd = attr->info.bpf_fd;
3334         struct fd f;
3335         int err;
3336
3337         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
3338                 return -EINVAL;
3339
3340         f = fdget(ufd);
3341         if (!f.file)
3342                 return -EBADFD;
3343
3344         if (f.file->f_op == &bpf_prog_fops)
3345                 err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
3346                                               uattr);
3347         else if (f.file->f_op == &bpf_map_fops)
3348                 err = bpf_map_get_info_by_fd(f.file->private_data, attr,
3349                                              uattr);
3350         else if (f.file->f_op == &btf_fops)
3351                 err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
3352         else
3353                 err = -EINVAL;
3354
3355         fdput(f);
3356         return err;
3357 }
3358
3359 #define BPF_BTF_LOAD_LAST_FIELD btf_log_level
3360
3361 static int bpf_btf_load(const union bpf_attr *attr)
3362 {
3363         if (CHECK_ATTR(BPF_BTF_LOAD))
3364                 return -EINVAL;
3365
3366         if (!capable(CAP_SYS_ADMIN))
3367                 return -EPERM;
3368
3369         return btf_new_fd(attr);
3370 }
3371
3372 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
3373
3374 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
3375 {
3376         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
3377                 return -EINVAL;
3378
3379         if (!capable(CAP_SYS_ADMIN))
3380                 return -EPERM;
3381
3382         return btf_get_fd_by_id(attr->btf_id);
3383 }
3384
3385 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
3386                                     union bpf_attr __user *uattr,
3387                                     u32 prog_id, u32 fd_type,
3388                                     const char *buf, u64 probe_offset,
3389                                     u64 probe_addr)
3390 {
3391         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
3392         u32 len = buf ? strlen(buf) : 0, input_len;
3393         int err = 0;
3394
3395         if (put_user(len, &uattr->task_fd_query.buf_len))
3396                 return -EFAULT;
3397         input_len = attr->task_fd_query.buf_len;
3398         if (input_len && ubuf) {
3399                 if (!len) {
3400                         /* nothing to copy, just make ubuf NULL terminated */
3401                         char zero = '\0';
3402
3403                         if (put_user(zero, ubuf))
3404                                 return -EFAULT;
3405                 } else if (input_len >= len + 1) {
3406                         /* ubuf can hold the string with NULL terminator */
3407                         if (copy_to_user(ubuf, buf, len + 1))
3408                                 return -EFAULT;
3409                 } else {
3410                         /* ubuf cannot hold the string with NULL terminator,
3411                          * do a partial copy with NULL terminator.
3412                          */
3413                         char zero = '\0';
3414
3415                         err = -ENOSPC;
3416                         if (copy_to_user(ubuf, buf, input_len - 1))
3417                                 return -EFAULT;
3418                         if (put_user(zero, ubuf + input_len - 1))
3419                                 return -EFAULT;
3420                 }
3421         }
3422
3423         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
3424             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
3425             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
3426             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
3427                 return -EFAULT;
3428
3429         return err;
3430 }
3431
3432 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
3433
3434 static int bpf_task_fd_query(const union bpf_attr *attr,
3435                              union bpf_attr __user *uattr)
3436 {
3437         pid_t pid = attr->task_fd_query.pid;
3438         u32 fd = attr->task_fd_query.fd;
3439         const struct perf_event *event;
3440         struct files_struct *files;
3441         struct task_struct *task;
3442         struct file *file;
3443         int err;
3444
3445         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
3446                 return -EINVAL;
3447
3448         if (!capable(CAP_SYS_ADMIN))
3449                 return -EPERM;
3450
3451         if (attr->task_fd_query.flags != 0)
3452                 return -EINVAL;
3453
3454         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
3455         if (!task)
3456                 return -ENOENT;
3457
3458         files = get_files_struct(task);
3459         put_task_struct(task);
3460         if (!files)
3461                 return -ENOENT;
3462
3463         err = 0;
3464         spin_lock(&files->file_lock);
3465         file = fcheck_files(files, fd);
3466         if (!file)
3467                 err = -EBADF;
3468         else
3469                 get_file(file);
3470         spin_unlock(&files->file_lock);
3471         put_files_struct(files);
3472
3473         if (err)
3474                 goto out;
3475
3476         if (file->f_op == &bpf_link_fops) {
3477                 struct bpf_link *link = file->private_data;
3478
3479                 if (link->ops == &bpf_raw_tp_lops) {
3480                         struct bpf_raw_tp_link *raw_tp =
3481                                 container_of(link, struct bpf_raw_tp_link, link);
3482                         struct bpf_raw_event_map *btp = raw_tp->btp;
3483
3484                         err = bpf_task_fd_query_copy(attr, uattr,
3485                                                      raw_tp->link.prog->aux->id,
3486                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
3487                                                      btp->tp->name, 0, 0);
3488                         goto put_file;
3489                 }
3490                 goto out_not_supp;
3491         }
3492
3493         event = perf_get_event(file);
3494         if (!IS_ERR(event)) {
3495                 u64 probe_offset, probe_addr;
3496                 u32 prog_id, fd_type;
3497                 const char *buf;
3498
3499                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
3500                                               &buf, &probe_offset,
3501                                               &probe_addr);
3502                 if (!err)
3503                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
3504                                                      fd_type, buf,
3505                                                      probe_offset,
3506                                                      probe_addr);
3507                 goto put_file;
3508         }
3509
3510 out_not_supp:
3511         err = -ENOTSUPP;
3512 put_file:
3513         fput(file);
3514 out:
3515         return err;
3516 }
3517
3518 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
3519
3520 #define BPF_DO_BATCH(fn)                        \
3521         do {                                    \
3522                 if (!fn) {                      \
3523                         err = -ENOTSUPP;        \
3524                         goto err_put;           \
3525                 }                               \
3526                 err = fn(map, attr, uattr);     \
3527         } while (0)
3528
3529 static int bpf_map_do_batch(const union bpf_attr *attr,
3530                             union bpf_attr __user *uattr,
3531                             int cmd)
3532 {
3533         struct bpf_map *map;
3534         int err, ufd;
3535         struct fd f;
3536
3537         if (CHECK_ATTR(BPF_MAP_BATCH))
3538                 return -EINVAL;
3539
3540         ufd = attr->batch.map_fd;
3541         f = fdget(ufd);
3542         map = __bpf_map_get(f);
3543         if (IS_ERR(map))
3544                 return PTR_ERR(map);
3545
3546         if ((cmd == BPF_MAP_LOOKUP_BATCH ||
3547              cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
3548             !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
3549                 err = -EPERM;
3550                 goto err_put;
3551         }
3552
3553         if (cmd != BPF_MAP_LOOKUP_BATCH &&
3554             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
3555                 err = -EPERM;
3556                 goto err_put;
3557         }
3558
3559         if (cmd == BPF_MAP_LOOKUP_BATCH)
3560                 BPF_DO_BATCH(map->ops->map_lookup_batch);
3561         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
3562                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
3563         else if (cmd == BPF_MAP_UPDATE_BATCH)
3564                 BPF_DO_BATCH(map->ops->map_update_batch);
3565         else
3566                 BPF_DO_BATCH(map->ops->map_delete_batch);
3567
3568 err_put:
3569         fdput(f);
3570         return err;
3571 }
3572
3573 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
3574 static int link_create(union bpf_attr *attr)
3575 {
3576         enum bpf_prog_type ptype;
3577         struct bpf_prog *prog;
3578         int ret;
3579
3580         if (!capable(CAP_NET_ADMIN))
3581                 return -EPERM;
3582
3583         if (CHECK_ATTR(BPF_LINK_CREATE))
3584                 return -EINVAL;
3585
3586         ptype = attach_type_to_prog_type(attr->link_create.attach_type);
3587         if (ptype == BPF_PROG_TYPE_UNSPEC)
3588                 return -EINVAL;
3589
3590         prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype);
3591         if (IS_ERR(prog))
3592                 return PTR_ERR(prog);
3593
3594         ret = bpf_prog_attach_check_attach_type(prog,
3595                                                 attr->link_create.attach_type);
3596         if (ret)
3597                 goto err_out;
3598
3599         switch (ptype) {
3600         case BPF_PROG_TYPE_CGROUP_SKB:
3601         case BPF_PROG_TYPE_CGROUP_SOCK:
3602         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3603         case BPF_PROG_TYPE_SOCK_OPS:
3604         case BPF_PROG_TYPE_CGROUP_DEVICE:
3605         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3606         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3607                 ret = cgroup_bpf_link_attach(attr, prog);
3608                 break;
3609         default:
3610                 ret = -EINVAL;
3611         }
3612
3613 err_out:
3614         if (ret < 0)
3615                 bpf_prog_put(prog);
3616         return ret;
3617 }
3618
3619 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
3620
3621 static int link_update(union bpf_attr *attr)
3622 {
3623         struct bpf_prog *old_prog = NULL, *new_prog;
3624         struct bpf_link *link;
3625         u32 flags;
3626         int ret;
3627
3628         if (!capable(CAP_NET_ADMIN))
3629                 return -EPERM;
3630
3631         if (CHECK_ATTR(BPF_LINK_UPDATE))
3632                 return -EINVAL;
3633
3634         flags = attr->link_update.flags;
3635         if (flags & ~BPF_F_REPLACE)
3636                 return -EINVAL;
3637
3638         link = bpf_link_get_from_fd(attr->link_update.link_fd);
3639         if (IS_ERR(link))
3640                 return PTR_ERR(link);
3641
3642         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
3643         if (IS_ERR(new_prog)) {
3644                 ret = PTR_ERR(new_prog);
3645                 goto out_put_link;
3646         }
3647
3648         if (flags & BPF_F_REPLACE) {
3649                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
3650                 if (IS_ERR(old_prog)) {
3651                         ret = PTR_ERR(old_prog);
3652                         old_prog = NULL;
3653                         goto out_put_progs;
3654                 }
3655         } else if (attr->link_update.old_prog_fd) {
3656                 ret = -EINVAL;
3657                 goto out_put_progs;
3658         }
3659
3660 #ifdef CONFIG_CGROUP_BPF
3661         if (link->ops == &bpf_cgroup_link_lops) {
3662                 ret = cgroup_bpf_replace(link, old_prog, new_prog);
3663                 goto out_put_progs;
3664         }
3665 #endif
3666         ret = -EINVAL;
3667
3668 out_put_progs:
3669         if (old_prog)
3670                 bpf_prog_put(old_prog);
3671         if (ret)
3672                 bpf_prog_put(new_prog);
3673 out_put_link:
3674         bpf_link_put(link);
3675         return ret;
3676 }
3677
3678 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
3679 {
3680         union bpf_attr attr;
3681         int err;
3682
3683         if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
3684                 return -EPERM;
3685
3686         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
3687         if (err)
3688                 return err;
3689         size = min_t(u32, size, sizeof(attr));
3690
3691         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
3692         memset(&attr, 0, sizeof(attr));
3693         if (copy_from_user(&attr, uattr, size) != 0)
3694                 return -EFAULT;
3695
3696         err = security_bpf(cmd, &attr, size);
3697         if (err < 0)
3698                 return err;
3699
3700         switch (cmd) {
3701         case BPF_MAP_CREATE:
3702                 err = map_create(&attr);
3703                 break;
3704         case BPF_MAP_LOOKUP_ELEM:
3705                 err = map_lookup_elem(&attr);
3706                 break;
3707         case BPF_MAP_UPDATE_ELEM:
3708                 err = map_update_elem(&attr);
3709                 break;
3710         case BPF_MAP_DELETE_ELEM:
3711                 err = map_delete_elem(&attr);
3712                 break;
3713         case BPF_MAP_GET_NEXT_KEY:
3714                 err = map_get_next_key(&attr);
3715                 break;
3716         case BPF_MAP_FREEZE:
3717                 err = map_freeze(&attr);
3718                 break;
3719         case BPF_PROG_LOAD:
3720                 err = bpf_prog_load(&attr, uattr);
3721                 break;
3722         case BPF_OBJ_PIN:
3723                 err = bpf_obj_pin(&attr);
3724                 break;
3725         case BPF_OBJ_GET:
3726                 err = bpf_obj_get(&attr);
3727                 break;
3728         case BPF_PROG_ATTACH:
3729                 err = bpf_prog_attach(&attr);
3730                 break;
3731         case BPF_PROG_DETACH:
3732                 err = bpf_prog_detach(&attr);
3733                 break;
3734         case BPF_PROG_QUERY:
3735                 err = bpf_prog_query(&attr, uattr);
3736                 break;
3737         case BPF_PROG_TEST_RUN:
3738                 err = bpf_prog_test_run(&attr, uattr);
3739                 break;
3740         case BPF_PROG_GET_NEXT_ID:
3741                 err = bpf_obj_get_next_id(&attr, uattr,
3742                                           &prog_idr, &prog_idr_lock);
3743                 break;
3744         case BPF_MAP_GET_NEXT_ID:
3745                 err = bpf_obj_get_next_id(&attr, uattr,
3746                                           &map_idr, &map_idr_lock);
3747                 break;
3748         case BPF_BTF_GET_NEXT_ID:
3749                 err = bpf_obj_get_next_id(&attr, uattr,
3750                                           &btf_idr, &btf_idr_lock);
3751                 break;
3752         case BPF_PROG_GET_FD_BY_ID:
3753                 err = bpf_prog_get_fd_by_id(&attr);
3754                 break;
3755         case BPF_MAP_GET_FD_BY_ID:
3756                 err = bpf_map_get_fd_by_id(&attr);
3757                 break;
3758         case BPF_OBJ_GET_INFO_BY_FD:
3759                 err = bpf_obj_get_info_by_fd(&attr, uattr);
3760                 break;
3761         case BPF_RAW_TRACEPOINT_OPEN:
3762                 err = bpf_raw_tracepoint_open(&attr);
3763                 break;
3764         case BPF_BTF_LOAD:
3765                 err = bpf_btf_load(&attr);
3766                 break;
3767         case BPF_BTF_GET_FD_BY_ID:
3768                 err = bpf_btf_get_fd_by_id(&attr);
3769                 break;
3770         case BPF_TASK_FD_QUERY:
3771                 err = bpf_task_fd_query(&attr, uattr);
3772                 break;
3773         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
3774                 err = map_lookup_and_delete_elem(&attr);
3775                 break;
3776         case BPF_MAP_LOOKUP_BATCH:
3777                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
3778                 break;
3779         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
3780                 err = bpf_map_do_batch(&attr, uattr,
3781                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
3782                 break;
3783         case BPF_MAP_UPDATE_BATCH:
3784                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
3785                 break;
3786         case BPF_MAP_DELETE_BATCH:
3787                 err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
3788                 break;
3789         case BPF_LINK_CREATE:
3790                 err = link_create(&attr);
3791                 break;
3792         case BPF_LINK_UPDATE:
3793                 err = link_update(&attr);
3794                 break;
3795         default:
3796                 err = -EINVAL;
3797                 break;
3798         }
3799
3800         return err;
3801 }