Merge tag 'csky-for-linus-6.6' of https://github.com/c-sky/csky-linux
[linux-2.6-microblaze.git] / kernel / bpf / syscall.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/bpf-cgroup.h>
6 #include <linux/bpf_trace.h>
7 #include <linux/bpf_lirc.h>
8 #include <linux/bpf_verifier.h>
9 #include <linux/bsearch.h>
10 #include <linux/btf.h>
11 #include <linux/syscalls.h>
12 #include <linux/slab.h>
13 #include <linux/sched/signal.h>
14 #include <linux/vmalloc.h>
15 #include <linux/mmzone.h>
16 #include <linux/anon_inodes.h>
17 #include <linux/fdtable.h>
18 #include <linux/file.h>
19 #include <linux/fs.h>
20 #include <linux/license.h>
21 #include <linux/filter.h>
22 #include <linux/kernel.h>
23 #include <linux/idr.h>
24 #include <linux/cred.h>
25 #include <linux/timekeeping.h>
26 #include <linux/ctype.h>
27 #include <linux/nospec.h>
28 #include <linux/audit.h>
29 #include <uapi/linux/btf.h>
30 #include <linux/pgtable.h>
31 #include <linux/bpf_lsm.h>
32 #include <linux/poll.h>
33 #include <linux/sort.h>
34 #include <linux/bpf-netns.h>
35 #include <linux/rcupdate_trace.h>
36 #include <linux/memcontrol.h>
37 #include <linux/trace_events.h>
38 #include <net/netfilter/nf_bpf_link.h>
39
40 #include <net/tcx.h>
41
/* Map-type predicates. Each one only inspects (m)->map_type. IS_FD_MAP()
 * is the union of the three more specific predicates.
 */
#define IS_FD_PROG_ARRAY(m)	((m)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(m)		((m)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_ARRAY(m)		((m)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
				 (m)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
				 (m)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_MAP(m)		(IS_FD_ARRAY(m) || IS_FD_PROG_ARRAY(m) || \
				 IS_FD_HASH(m))

/* The read-only / write-only object flags taken together. */
#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
51
52 DEFINE_PER_CPU(int, bpf_prog_active);
53 static DEFINE_IDR(prog_idr);
54 static DEFINE_SPINLOCK(prog_idr_lock);
55 static DEFINE_IDR(map_idr);
56 static DEFINE_SPINLOCK(map_idr_lock);
57 static DEFINE_IDR(link_idr);
58 static DEFINE_SPINLOCK(link_idr_lock);
59
60 int sysctl_unprivileged_bpf_disabled __read_mostly =
61         IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
62
63 static const struct bpf_map_ops * const bpf_map_types[] = {
64 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
65 #define BPF_MAP_TYPE(_id, _ops) \
66         [_id] = &_ops,
67 #define BPF_LINK_TYPE(_id, _name)
68 #include <linux/bpf_types.h>
69 #undef BPF_PROG_TYPE
70 #undef BPF_MAP_TYPE
71 #undef BPF_LINK_TYPE
72 };
73
74 /*
75  * If we're handed a bigger struct than we know of, ensure all the unknown bits
76  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
77  * we don't know about yet.
78  *
79  * There is a ToCToU between this function call and the following
80  * copy_from_user() call. However, this is not a concern since this function is
81  * meant to be a future-proofing of bits.
82  */
83 int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
84                              size_t expected_size,
85                              size_t actual_size)
86 {
87         int res;
88
89         if (unlikely(actual_size > PAGE_SIZE))  /* silly large */
90                 return -E2BIG;
91
92         if (actual_size <= expected_size)
93                 return 0;
94
95         if (uaddr.is_kernel)
96                 res = memchr_inv(uaddr.kernel + expected_size, 0,
97                                  actual_size - expected_size) == NULL;
98         else
99                 res = check_zeroed_user(uaddr.user + expected_size,
100                                         actual_size - expected_size);
101         if (res < 0)
102                 return res;
103         return res ? 0 : -E2BIG;
104 }
105
106 const struct bpf_map_ops bpf_map_offload_ops = {
107         .map_meta_equal = bpf_map_meta_equal,
108         .map_alloc = bpf_map_offload_map_alloc,
109         .map_free = bpf_map_offload_map_free,
110         .map_check_btf = map_check_no_btf,
111         .map_mem_usage = bpf_map_offload_map_mem_usage,
112 };
113
114 static void bpf_map_write_active_inc(struct bpf_map *map)
115 {
116         atomic64_inc(&map->writecnt);
117 }
118
119 static void bpf_map_write_active_dec(struct bpf_map *map)
120 {
121         atomic64_dec(&map->writecnt);
122 }
123
124 bool bpf_map_write_active(const struct bpf_map *map)
125 {
126         return atomic64_read(&map->writecnt) != 0;
127 }
128
129 static u32 bpf_map_value_size(const struct bpf_map *map)
130 {
131         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
132             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
133             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
134             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
135                 return round_up(map->value_size, 8) * num_possible_cpus();
136         else if (IS_FD_MAP(map))
137                 return sizeof(u32);
138         else
139                 return  map->value_size;
140 }
141
142 static void maybe_wait_bpf_programs(struct bpf_map *map)
143 {
144         /* Wait for any running BPF programs to complete so that
145          * userspace, when we return to it, knows that all programs
146          * that could be running use the new map value.
147          */
148         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
149             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
150                 synchronize_rcu();
151 }
152
153 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
154                                 void *key, void *value, __u64 flags)
155 {
156         int err;
157
158         /* Need to create a kthread, thus must support schedule */
159         if (bpf_map_is_offloaded(map)) {
160                 return bpf_map_offload_update_elem(map, key, value, flags);
161         } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
162                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
163                 return map->ops->map_update_elem(map, key, value, flags);
164         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
165                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
166                 return sock_map_update_elem_sys(map, key, value, flags);
167         } else if (IS_FD_PROG_ARRAY(map)) {
168                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
169                                                     flags);
170         }
171
172         bpf_disable_instrumentation();
173         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
174             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
175                 err = bpf_percpu_hash_update(map, key, value, flags);
176         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
177                 err = bpf_percpu_array_update(map, key, value, flags);
178         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
179                 err = bpf_percpu_cgroup_storage_update(map, key, value,
180                                                        flags);
181         } else if (IS_FD_ARRAY(map)) {
182                 rcu_read_lock();
183                 err = bpf_fd_array_map_update_elem(map, map_file, key, value,
184                                                    flags);
185                 rcu_read_unlock();
186         } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
187                 rcu_read_lock();
188                 err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
189                                                   flags);
190                 rcu_read_unlock();
191         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
192                 /* rcu_read_lock() is not needed */
193                 err = bpf_fd_reuseport_array_update_elem(map, key, value,
194                                                          flags);
195         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
196                    map->map_type == BPF_MAP_TYPE_STACK ||
197                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
198                 err = map->ops->map_push_elem(map, value, flags);
199         } else {
200                 rcu_read_lock();
201                 err = map->ops->map_update_elem(map, key, value, flags);
202                 rcu_read_unlock();
203         }
204         bpf_enable_instrumentation();
205         maybe_wait_bpf_programs(map);
206
207         return err;
208 }
209
210 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
211                               __u64 flags)
212 {
213         void *ptr;
214         int err;
215
216         if (bpf_map_is_offloaded(map))
217                 return bpf_map_offload_lookup_elem(map, key, value);
218
219         bpf_disable_instrumentation();
220         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
221             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
222                 err = bpf_percpu_hash_copy(map, key, value);
223         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
224                 err = bpf_percpu_array_copy(map, key, value);
225         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
226                 err = bpf_percpu_cgroup_storage_copy(map, key, value);
227         } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
228                 err = bpf_stackmap_copy(map, key, value);
229         } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
230                 err = bpf_fd_array_map_lookup_elem(map, key, value);
231         } else if (IS_FD_HASH(map)) {
232                 err = bpf_fd_htab_map_lookup_elem(map, key, value);
233         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
234                 err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
235         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
236                    map->map_type == BPF_MAP_TYPE_STACK ||
237                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
238                 err = map->ops->map_peek_elem(map, value);
239         } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
240                 /* struct_ops map requires directly updating "value" */
241                 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
242         } else {
243                 rcu_read_lock();
244                 if (map->ops->map_lookup_elem_sys_only)
245                         ptr = map->ops->map_lookup_elem_sys_only(map, key);
246                 else
247                         ptr = map->ops->map_lookup_elem(map, key);
248                 if (IS_ERR(ptr)) {
249                         err = PTR_ERR(ptr);
250                 } else if (!ptr) {
251                         err = -ENOENT;
252                 } else {
253                         err = 0;
254                         if (flags & BPF_F_LOCK)
255                                 /* lock 'ptr' and copy everything but lock */
256                                 copy_map_value_locked(map, value, ptr, true);
257                         else
258                                 copy_map_value(map, value, ptr);
259                         /* mask lock and timer, since value wasn't zero inited */
260                         check_and_init_map_value(map, value);
261                 }
262                 rcu_read_unlock();
263         }
264
265         bpf_enable_instrumentation();
266         maybe_wait_bpf_programs(map);
267
268         return err;
269 }
270
271 /* Please, do not use this function outside from the map creation path
272  * (e.g. in map update path) without taking care of setting the active
273  * memory cgroup (see at bpf_map_kmalloc_node() for example).
274  */
275 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
276 {
277         /* We really just want to fail instead of triggering OOM killer
278          * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
279          * which is used for lower order allocation requests.
280          *
281          * It has been observed that higher order allocation requests done by
282          * vmalloc with __GFP_NORETRY being set might fail due to not trying
283          * to reclaim memory from the page cache, thus we set
284          * __GFP_RETRY_MAYFAIL to avoid such situations.
285          */
286
287         gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
288         unsigned int flags = 0;
289         unsigned long align = 1;
290         void *area;
291
292         if (size >= SIZE_MAX)
293                 return NULL;
294
295         /* kmalloc()'ed memory can't be mmap()'ed */
296         if (mmapable) {
297                 BUG_ON(!PAGE_ALIGNED(size));
298                 align = SHMLBA;
299                 flags = VM_USERMAP;
300         } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
301                 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
302                                     numa_node);
303                 if (area != NULL)
304                         return area;
305         }
306
307         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
308                         gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
309                         flags, numa_node, __builtin_return_address(0));
310 }
311
312 void *bpf_map_area_alloc(u64 size, int numa_node)
313 {
314         return __bpf_map_area_alloc(size, numa_node, false);
315 }
316
317 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
318 {
319         return __bpf_map_area_alloc(size, numa_node, true);
320 }
321
/* Release @area with kvfree(), which accepts both kmalloc'ed and vmalloc'ed
 * memory; __bpf_map_area_alloc() can return either kind.
 */
void bpf_map_area_free(void *area)
{
	kvfree(area);
}
326
327 static u32 bpf_map_flags_retain_permanent(u32 flags)
328 {
329         /* Some map creation flags are not tied to the map object but
330          * rather to the map fd instead, so they have no meaning upon
331          * map object inspection since multiple file descriptors with
332          * different (access) properties can exist here. Thus, given
333          * this has zero meaning for the map itself, lets clear these
334          * from here.
335          */
336         return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
337 }
338
339 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
340 {
341         map->map_type = attr->map_type;
342         map->key_size = attr->key_size;
343         map->value_size = attr->value_size;
344         map->max_entries = attr->max_entries;
345         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
346         map->numa_node = bpf_map_attr_numa_node(attr);
347         map->map_extra = attr->map_extra;
348 }
349
350 static int bpf_map_alloc_id(struct bpf_map *map)
351 {
352         int id;
353
354         idr_preload(GFP_KERNEL);
355         spin_lock_bh(&map_idr_lock);
356         id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
357         if (id > 0)
358                 map->id = id;
359         spin_unlock_bh(&map_idr_lock);
360         idr_preload_end();
361
362         if (WARN_ON_ONCE(!id))
363                 return -ENOSPC;
364
365         return id > 0 ? 0 : id;
366 }
367
368 void bpf_map_free_id(struct bpf_map *map)
369 {
370         unsigned long flags;
371
372         /* Offloaded maps are removed from the IDR store when their device
373          * disappears - even if someone holds an fd to them they are unusable,
374          * the memory is gone, all ops will fail; they are simply waiting for
375          * refcnt to drop to be freed.
376          */
377         if (!map->id)
378                 return;
379
380         spin_lock_irqsave(&map_idr_lock, flags);
381
382         idr_remove(&map_idr, map->id);
383         map->id = 0;
384
385         spin_unlock_irqrestore(&map_idr_lock, flags);
386 }
387
388 #ifdef CONFIG_MEMCG_KMEM
389 static void bpf_map_save_memcg(struct bpf_map *map)
390 {
391         /* Currently if a map is created by a process belonging to the root
392          * memory cgroup, get_obj_cgroup_from_current() will return NULL.
393          * So we have to check map->objcg for being NULL each time it's
394          * being used.
395          */
396         if (memcg_bpf_enabled())
397                 map->objcg = get_obj_cgroup_from_current();
398 }
399
400 static void bpf_map_release_memcg(struct bpf_map *map)
401 {
402         if (map->objcg)
403                 obj_cgroup_put(map->objcg);
404 }
405
406 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
407 {
408         if (map->objcg)
409                 return get_mem_cgroup_from_objcg(map->objcg);
410
411         return root_mem_cgroup;
412 }
413
414 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
415                            int node)
416 {
417         struct mem_cgroup *memcg, *old_memcg;
418         void *ptr;
419
420         memcg = bpf_map_get_memcg(map);
421         old_memcg = set_active_memcg(memcg);
422         ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
423         set_active_memcg(old_memcg);
424         mem_cgroup_put(memcg);
425
426         return ptr;
427 }
428
429 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
430 {
431         struct mem_cgroup *memcg, *old_memcg;
432         void *ptr;
433
434         memcg = bpf_map_get_memcg(map);
435         old_memcg = set_active_memcg(memcg);
436         ptr = kzalloc(size, flags | __GFP_ACCOUNT);
437         set_active_memcg(old_memcg);
438         mem_cgroup_put(memcg);
439
440         return ptr;
441 }
442
443 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
444                        gfp_t flags)
445 {
446         struct mem_cgroup *memcg, *old_memcg;
447         void *ptr;
448
449         memcg = bpf_map_get_memcg(map);
450         old_memcg = set_active_memcg(memcg);
451         ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
452         set_active_memcg(old_memcg);
453         mem_cgroup_put(memcg);
454
455         return ptr;
456 }
457
458 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
459                                     size_t align, gfp_t flags)
460 {
461         struct mem_cgroup *memcg, *old_memcg;
462         void __percpu *ptr;
463
464         memcg = bpf_map_get_memcg(map);
465         old_memcg = set_active_memcg(memcg);
466         ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
467         set_active_memcg(old_memcg);
468         mem_cgroup_put(memcg);
469
470         return ptr;
471 }
472
473 #else
/* Without CONFIG_MEMCG_KMEM there is no cgroup to record: no-op. */
static void bpf_map_save_memcg(struct bpf_map *map)
{
	/* intentionally empty */
}

/* Without CONFIG_MEMCG_KMEM there is no cgroup reference to drop: no-op. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
	/* intentionally empty */
}
481 #endif
482
483 static int btf_field_cmp(const void *a, const void *b)
484 {
485         const struct btf_field *f1 = a, *f2 = b;
486
487         if (f1->offset < f2->offset)
488                 return -1;
489         else if (f1->offset > f2->offset)
490                 return 1;
491         return 0;
492 }
493
494 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
495                                   u32 field_mask)
496 {
497         struct btf_field *field;
498
499         if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
500                 return NULL;
501         field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
502         if (!field || !(field->type & field_mask))
503                 return NULL;
504         return field;
505 }
506
507 void btf_record_free(struct btf_record *rec)
508 {
509         int i;
510
511         if (IS_ERR_OR_NULL(rec))
512                 return;
513         for (i = 0; i < rec->cnt; i++) {
514                 switch (rec->fields[i].type) {
515                 case BPF_KPTR_UNREF:
516                 case BPF_KPTR_REF:
517                         if (rec->fields[i].kptr.module)
518                                 module_put(rec->fields[i].kptr.module);
519                         btf_put(rec->fields[i].kptr.btf);
520                         break;
521                 case BPF_LIST_HEAD:
522                 case BPF_LIST_NODE:
523                 case BPF_RB_ROOT:
524                 case BPF_RB_NODE:
525                 case BPF_SPIN_LOCK:
526                 case BPF_TIMER:
527                 case BPF_REFCOUNT:
528                         /* Nothing to release */
529                         break;
530                 default:
531                         WARN_ON_ONCE(1);
532                         continue;
533                 }
534         }
535         kfree(rec);
536 }
537
538 void bpf_map_free_record(struct bpf_map *map)
539 {
540         btf_record_free(map->record);
541         map->record = NULL;
542 }
543
544 struct btf_record *btf_record_dup(const struct btf_record *rec)
545 {
546         const struct btf_field *fields;
547         struct btf_record *new_rec;
548         int ret, size, i;
549
550         if (IS_ERR_OR_NULL(rec))
551                 return NULL;
552         size = offsetof(struct btf_record, fields[rec->cnt]);
553         new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
554         if (!new_rec)
555                 return ERR_PTR(-ENOMEM);
556         /* Do a deep copy of the btf_record */
557         fields = rec->fields;
558         new_rec->cnt = 0;
559         for (i = 0; i < rec->cnt; i++) {
560                 switch (fields[i].type) {
561                 case BPF_KPTR_UNREF:
562                 case BPF_KPTR_REF:
563                         btf_get(fields[i].kptr.btf);
564                         if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
565                                 ret = -ENXIO;
566                                 goto free;
567                         }
568                         break;
569                 case BPF_LIST_HEAD:
570                 case BPF_LIST_NODE:
571                 case BPF_RB_ROOT:
572                 case BPF_RB_NODE:
573                 case BPF_SPIN_LOCK:
574                 case BPF_TIMER:
575                 case BPF_REFCOUNT:
576                         /* Nothing to acquire */
577                         break;
578                 default:
579                         ret = -EFAULT;
580                         WARN_ON_ONCE(1);
581                         goto free;
582                 }
583                 new_rec->cnt++;
584         }
585         return new_rec;
586 free:
587         btf_record_free(new_rec);
588         return ERR_PTR(ret);
589 }
590
591 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
592 {
593         bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
594         int size;
595
596         if (!a_has_fields && !b_has_fields)
597                 return true;
598         if (a_has_fields != b_has_fields)
599                 return false;
600         if (rec_a->cnt != rec_b->cnt)
601                 return false;
602         size = offsetof(struct btf_record, fields[rec_a->cnt]);
603         /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
604          * members are zeroed out. So memcmp is safe to do without worrying
605          * about padding/unused fields.
606          *
607          * While spin_lock, timer, and kptr have no relation to map BTF,
608          * list_head metadata is specific to map BTF, the btf and value_rec
609          * members in particular. btf is the map BTF, while value_rec points to
610          * btf_record in that map BTF.
611          *
612          * So while by default, we don't rely on the map BTF (which the records
613          * were parsed from) matching for both records, which is not backwards
614          * compatible, in case list_head is part of it, we implicitly rely on
615          * that by way of depending on memcmp succeeding for it.
616          */
617         return !memcmp(rec_a, rec_b, size);
618 }
619
620 void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
621 {
622         if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
623                 return;
624         bpf_timer_cancel_and_free(obj + rec->timer_off);
625 }
626
627 extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
628
629 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
630 {
631         const struct btf_field *fields;
632         int i;
633
634         if (IS_ERR_OR_NULL(rec))
635                 return;
636         fields = rec->fields;
637         for (i = 0; i < rec->cnt; i++) {
638                 struct btf_struct_meta *pointee_struct_meta;
639                 const struct btf_field *field = &fields[i];
640                 void *field_ptr = obj + field->offset;
641                 void *xchgd_field;
642
643                 switch (fields[i].type) {
644                 case BPF_SPIN_LOCK:
645                         break;
646                 case BPF_TIMER:
647                         bpf_timer_cancel_and_free(field_ptr);
648                         break;
649                 case BPF_KPTR_UNREF:
650                         WRITE_ONCE(*(u64 *)field_ptr, 0);
651                         break;
652                 case BPF_KPTR_REF:
653                         xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
654                         if (!xchgd_field)
655                                 break;
656
657                         if (!btf_is_kernel(field->kptr.btf)) {
658                                 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
659                                                                            field->kptr.btf_id);
660                                 migrate_disable();
661                                 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
662                                                                  pointee_struct_meta->record :
663                                                                  NULL);
664                                 migrate_enable();
665                         } else {
666                                 field->kptr.dtor(xchgd_field);
667                         }
668                         break;
669                 case BPF_LIST_HEAD:
670                         if (WARN_ON_ONCE(rec->spin_lock_off < 0))
671                                 continue;
672                         bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
673                         break;
674                 case BPF_RB_ROOT:
675                         if (WARN_ON_ONCE(rec->spin_lock_off < 0))
676                                 continue;
677                         bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
678                         break;
679                 case BPF_LIST_NODE:
680                 case BPF_RB_NODE:
681                 case BPF_REFCOUNT:
682                         break;
683                 default:
684                         WARN_ON_ONCE(1);
685                         continue;
686                 }
687         }
688 }
689
690 /* called from workqueue */
691 static void bpf_map_free_deferred(struct work_struct *work)
692 {
693         struct bpf_map *map = container_of(work, struct bpf_map, work);
694         struct btf_record *rec = map->record;
695
696         security_bpf_map_free(map);
697         bpf_map_release_memcg(map);
698         /* implementation dependent freeing */
699         map->ops->map_free(map);
700         /* Delay freeing of btf_record for maps, as map_free
701          * callback usually needs access to them. It is better to do it here
702          * than require each callback to do the free itself manually.
703          *
704          * Note that the btf_record stashed in map->inner_map_meta->record was
705          * already freed using the map_free callback for map in map case which
706          * eventually calls bpf_map_free_meta, since inner_map_meta is only a
707          * template bpf_map struct used during verification.
708          */
709         btf_record_free(rec);
710 }
711
712 static void bpf_map_put_uref(struct bpf_map *map)
713 {
714         if (atomic64_dec_and_test(&map->usercnt)) {
715                 if (map->ops->map_release_uref)
716                         map->ops->map_release_uref(map);
717         }
718 }
719
720 /* decrement map refcnt and schedule it for freeing via workqueue
721  * (underlying map implementation ops->map_free() might sleep)
722  */
723 void bpf_map_put(struct bpf_map *map)
724 {
725         if (atomic64_dec_and_test(&map->refcnt)) {
726                 /* bpf_map_free_id() must be called first */
727                 bpf_map_free_id(map);
728                 btf_put(map->btf);
729                 INIT_WORK(&map->work, bpf_map_free_deferred);
730                 /* Avoid spawning kworkers, since they all might contend
731                  * for the same mutex like slab_mutex.
732                  */
733                 queue_work(system_unbound_wq, &map->work);
734         }
735 }
736 EXPORT_SYMBOL_GPL(bpf_map_put);
737
/* Drop a user reference and then a regular reference on @map, in that
 * order.
 */
void bpf_map_put_with_uref(struct bpf_map *map)
{
	/* usercnt first: the map must still be alive for the uref callback */
	bpf_map_put_uref(map);
	bpf_map_put(map);
}
743
744 static int bpf_map_release(struct inode *inode, struct file *filp)
745 {
746         struct bpf_map *map = filp->private_data;
747
748         if (map->ops->map_release)
749                 map->ops->map_release(map, filp);
750
751         bpf_map_put_with_uref(map);
752         return 0;
753 }
754
755 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
756 {
757         fmode_t mode = f.file->f_mode;
758
759         /* Our file permissions may have been overridden by global
760          * map permissions facing syscall side.
761          */
762         if (READ_ONCE(map->frozen))
763                 mode &= ~FMODE_CAN_WRITE;
764         return mode;
765 }
766
767 #ifdef CONFIG_PROC_FS
768 /* Show the memory usage of a bpf map */
769 static u64 bpf_map_memory_usage(const struct bpf_map *map)
770 {
771         return map->ops->map_mem_usage(map);
772 }
773
774 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
775 {
776         struct bpf_map *map = filp->private_data;
777         u32 type = 0, jited = 0;
778
779         if (map_type_contains_progs(map)) {
780                 spin_lock(&map->owner.lock);
781                 type  = map->owner.type;
782                 jited = map->owner.jited;
783                 spin_unlock(&map->owner.lock);
784         }
785
786         seq_printf(m,
787                    "map_type:\t%u\n"
788                    "key_size:\t%u\n"
789                    "value_size:\t%u\n"
790                    "max_entries:\t%u\n"
791                    "map_flags:\t%#x\n"
792                    "map_extra:\t%#llx\n"
793                    "memlock:\t%llu\n"
794                    "map_id:\t%u\n"
795                    "frozen:\t%u\n",
796                    map->map_type,
797                    map->key_size,
798                    map->value_size,
799                    map->max_entries,
800                    map->map_flags,
801                    (unsigned long long)map->map_extra,
802                    bpf_map_memory_usage(map),
803                    map->id,
804                    READ_ONCE(map->frozen));
805         if (type) {
806                 seq_printf(m, "owner_prog_type:\t%u\n", type);
807                 seq_printf(m, "owner_jited:\t%u\n", jited);
808         }
809 }
810 #endif
811
812 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
813                               loff_t *ppos)
814 {
815         /* We need this handler such that alloc_file() enables
816          * f_mode with FMODE_CAN_READ.
817          */
818         return -EINVAL;
819 }
820
821 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
822                                size_t siz, loff_t *ppos)
823 {
824         /* We need this handler such that alloc_file() enables
825          * f_mode with FMODE_CAN_WRITE.
826          */
827         return -EINVAL;
828 }
829
830 /* called for any extra memory-mapped regions (except initial) */
831 static void bpf_map_mmap_open(struct vm_area_struct *vma)
832 {
833         struct bpf_map *map = vma->vm_file->private_data;
834
835         if (vma->vm_flags & VM_MAYWRITE)
836                 bpf_map_write_active_inc(map);
837 }
838
839 /* called for all unmapped memory region (including initial) */
840 static void bpf_map_mmap_close(struct vm_area_struct *vma)
841 {
842         struct bpf_map *map = vma->vm_file->private_data;
843
844         if (vma->vm_flags & VM_MAYWRITE)
845                 bpf_map_write_active_dec(map);
846 }
847
848 static const struct vm_operations_struct bpf_map_default_vmops = {
849         .open           = bpf_map_mmap_open,
850         .close          = bpf_map_mmap_close,
851 };
852
853 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
854 {
855         struct bpf_map *map = filp->private_data;
856         int err;
857
858         if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
859                 return -ENOTSUPP;
860
861         if (!(vma->vm_flags & VM_SHARED))
862                 return -EINVAL;
863
864         mutex_lock(&map->freeze_mutex);
865
866         if (vma->vm_flags & VM_WRITE) {
867                 if (map->frozen) {
868                         err = -EPERM;
869                         goto out;
870                 }
871                 /* map is meant to be read-only, so do not allow mapping as
872                  * writable, because it's possible to leak a writable page
873                  * reference and allows user-space to still modify it after
874                  * freezing, while verifier will assume contents do not change
875                  */
876                 if (map->map_flags & BPF_F_RDONLY_PROG) {
877                         err = -EACCES;
878                         goto out;
879                 }
880         }
881
882         /* set default open/close callbacks */
883         vma->vm_ops = &bpf_map_default_vmops;
884         vma->vm_private_data = map;
885         vm_flags_clear(vma, VM_MAYEXEC);
886         if (!(vma->vm_flags & VM_WRITE))
887                 /* disallow re-mapping with PROT_WRITE */
888                 vm_flags_clear(vma, VM_MAYWRITE);
889
890         err = map->ops->map_mmap(map, vma);
891         if (err)
892                 goto out;
893
894         if (vma->vm_flags & VM_MAYWRITE)
895                 bpf_map_write_active_inc(map);
896 out:
897         mutex_unlock(&map->freeze_mutex);
898         return err;
899 }
900
901 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
902 {
903         struct bpf_map *map = filp->private_data;
904
905         if (map->ops->map_poll)
906                 return map->ops->map_poll(map, filp, pts);
907
908         return EPOLLERR;
909 }
910
911 const struct file_operations bpf_map_fops = {
912 #ifdef CONFIG_PROC_FS
913         .show_fdinfo    = bpf_map_show_fdinfo,
914 #endif
915         .release        = bpf_map_release,
916         .read           = bpf_dummy_read,
917         .write          = bpf_dummy_write,
918         .mmap           = bpf_map_mmap,
919         .poll           = bpf_map_poll,
920 };
921
922 int bpf_map_new_fd(struct bpf_map *map, int flags)
923 {
924         int ret;
925
926         ret = security_bpf_map(map, OPEN_FMODE(flags));
927         if (ret < 0)
928                 return ret;
929
930         return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
931                                 flags | O_CLOEXEC);
932 }
933
934 int bpf_get_file_flag(int flags)
935 {
936         if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
937                 return -EINVAL;
938         if (flags & BPF_F_RDONLY)
939                 return O_RDONLY;
940         if (flags & BPF_F_WRONLY)
941                 return O_WRONLY;
942         return O_RDWR;
943 }
944
/* Helper macro to check that the unused fields of 'union bpf_attr' are zero.
 *
 * Expects a 'union bpf_attr *attr' in the caller's scope and a
 * CMD##_LAST_FIELD macro naming the last attr field that command CMD uses.
 * Evaluates to true when any byte of *attr located after that field is
 * non-zero.
 *
 * The expansion is fully parenthesized so that it composes safely with
 * surrounding operators such as '!' or '&&'.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
952
/* Copy and validate an object name.
 *
 * dst and src must have at least "size" number of bytes. dst is zeroed
 * first, then characters are copied up to the terminating '\0' of src.
 * Only isalnum(), '_' and '.' characters are accepted.
 *
 * Return strlen on success and -EINVAL when src contains any other
 * character or has no '\0' within its first "size" bytes.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	const char *end = src + size;
	const char *orig_src = src;

	memset(dst, 0, size);
	/* Copy all isalnum(), '_' and '.' chars. */
	while (src < end && *src) {
		/* classify via unsigned char so that bytes >= 0x80 never
		 * reach isalnum() as a negative value
		 */
		unsigned char c = *src;

		if (!isalnum(c) && c != '_' && c != '.')
			return -EINVAL;
		*dst++ = *src++;
	}

	/* No '\0' found in "size" number of bytes */
	if (src == end)
		return -EINVAL;

	return src - orig_src;
}
976
977 int map_check_no_btf(const struct bpf_map *map,
978                      const struct btf *btf,
979                      const struct btf_type *key_type,
980                      const struct btf_type *value_type)
981 {
982         return -ENOTSUPP;
983 }
984
985 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
986                          u32 btf_key_id, u32 btf_value_id)
987 {
988         const struct btf_type *key_type, *value_type;
989         u32 key_size, value_size;
990         int ret = 0;
991
992         /* Some maps allow key to be unspecified. */
993         if (btf_key_id) {
994                 key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
995                 if (!key_type || key_size != map->key_size)
996                         return -EINVAL;
997         } else {
998                 key_type = btf_type_by_id(btf, 0);
999                 if (!map->ops->map_check_btf)
1000                         return -EINVAL;
1001         }
1002
1003         value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
1004         if (!value_type || value_size != map->value_size)
1005                 return -EINVAL;
1006
1007         map->record = btf_parse_fields(btf, value_type,
1008                                        BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
1009                                        BPF_RB_ROOT | BPF_REFCOUNT,
1010                                        map->value_size);
1011         if (!IS_ERR_OR_NULL(map->record)) {
1012                 int i;
1013
1014                 if (!bpf_capable()) {
1015                         ret = -EPERM;
1016                         goto free_map_tab;
1017                 }
1018                 if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
1019                         ret = -EACCES;
1020                         goto free_map_tab;
1021                 }
1022                 for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
1023                         switch (map->record->field_mask & (1 << i)) {
1024                         case 0:
1025                                 continue;
1026                         case BPF_SPIN_LOCK:
1027                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1028                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1029                                     map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
1030                                     map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1031                                     map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1032                                     map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1033                                     map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1034                                         ret = -EOPNOTSUPP;
1035                                         goto free_map_tab;
1036                                 }
1037                                 break;
1038                         case BPF_TIMER:
1039                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1040                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1041                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1042                                         ret = -EOPNOTSUPP;
1043                                         goto free_map_tab;
1044                                 }
1045                                 break;
1046                         case BPF_KPTR_UNREF:
1047                         case BPF_KPTR_REF:
1048                         case BPF_REFCOUNT:
1049                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1050                                     map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
1051                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1052                                     map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
1053                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1054                                     map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
1055                                     map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1056                                     map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1057                                     map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1058                                     map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1059                                         ret = -EOPNOTSUPP;
1060                                         goto free_map_tab;
1061                                 }
1062                                 break;
1063                         case BPF_LIST_HEAD:
1064                         case BPF_RB_ROOT:
1065                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1066                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1067                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1068                                         ret = -EOPNOTSUPP;
1069                                         goto free_map_tab;
1070                                 }
1071                                 break;
1072                         default:
1073                                 /* Fail if map_type checks are missing for a field type */
1074                                 ret = -EOPNOTSUPP;
1075                                 goto free_map_tab;
1076                         }
1077                 }
1078         }
1079
1080         ret = btf_check_and_fixup_fields(btf, map->record);
1081         if (ret < 0)
1082                 goto free_map_tab;
1083
1084         if (map->ops->map_check_btf) {
1085                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
1086                 if (ret < 0)
1087                         goto free_map_tab;
1088         }
1089
1090         return ret;
1091 free_map_tab:
1092         bpf_map_free_record(map);
1093         return ret;
1094 }
1095
1096 #define BPF_MAP_CREATE_LAST_FIELD map_extra
1097 /* called via syscall */
1098 static int map_create(union bpf_attr *attr)
1099 {
1100         const struct bpf_map_ops *ops;
1101         int numa_node = bpf_map_attr_numa_node(attr);
1102         u32 map_type = attr->map_type;
1103         struct bpf_map *map;
1104         int f_flags;
1105         int err;
1106
1107         err = CHECK_ATTR(BPF_MAP_CREATE);
1108         if (err)
1109                 return -EINVAL;
1110
1111         if (attr->btf_vmlinux_value_type_id) {
1112                 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
1113                     attr->btf_key_type_id || attr->btf_value_type_id)
1114                         return -EINVAL;
1115         } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
1116                 return -EINVAL;
1117         }
1118
1119         if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
1120             attr->map_extra != 0)
1121                 return -EINVAL;
1122
1123         f_flags = bpf_get_file_flag(attr->map_flags);
1124         if (f_flags < 0)
1125                 return f_flags;
1126
1127         if (numa_node != NUMA_NO_NODE &&
1128             ((unsigned int)numa_node >= nr_node_ids ||
1129              !node_online(numa_node)))
1130                 return -EINVAL;
1131
1132         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
1133         map_type = attr->map_type;
1134         if (map_type >= ARRAY_SIZE(bpf_map_types))
1135                 return -EINVAL;
1136         map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
1137         ops = bpf_map_types[map_type];
1138         if (!ops)
1139                 return -EINVAL;
1140
1141         if (ops->map_alloc_check) {
1142                 err = ops->map_alloc_check(attr);
1143                 if (err)
1144                         return err;
1145         }
1146         if (attr->map_ifindex)
1147                 ops = &bpf_map_offload_ops;
1148         if (!ops->map_mem_usage)
1149                 return -EINVAL;
1150
1151         /* Intent here is for unprivileged_bpf_disabled to block BPF map
1152          * creation for unprivileged users; other actions depend
1153          * on fd availability and access to bpffs, so are dependent on
1154          * object creation success. Even with unprivileged BPF disabled,
1155          * capability checks are still carried out.
1156          */
1157         if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
1158                 return -EPERM;
1159
1160         /* check privileged map type permissions */
1161         switch (map_type) {
1162         case BPF_MAP_TYPE_ARRAY:
1163         case BPF_MAP_TYPE_PERCPU_ARRAY:
1164         case BPF_MAP_TYPE_PROG_ARRAY:
1165         case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
1166         case BPF_MAP_TYPE_CGROUP_ARRAY:
1167         case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1168         case BPF_MAP_TYPE_HASH:
1169         case BPF_MAP_TYPE_PERCPU_HASH:
1170         case BPF_MAP_TYPE_HASH_OF_MAPS:
1171         case BPF_MAP_TYPE_RINGBUF:
1172         case BPF_MAP_TYPE_USER_RINGBUF:
1173         case BPF_MAP_TYPE_CGROUP_STORAGE:
1174         case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
1175                 /* unprivileged */
1176                 break;
1177         case BPF_MAP_TYPE_SK_STORAGE:
1178         case BPF_MAP_TYPE_INODE_STORAGE:
1179         case BPF_MAP_TYPE_TASK_STORAGE:
1180         case BPF_MAP_TYPE_CGRP_STORAGE:
1181         case BPF_MAP_TYPE_BLOOM_FILTER:
1182         case BPF_MAP_TYPE_LPM_TRIE:
1183         case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
1184         case BPF_MAP_TYPE_STACK_TRACE:
1185         case BPF_MAP_TYPE_QUEUE:
1186         case BPF_MAP_TYPE_STACK:
1187         case BPF_MAP_TYPE_LRU_HASH:
1188         case BPF_MAP_TYPE_LRU_PERCPU_HASH:
1189         case BPF_MAP_TYPE_STRUCT_OPS:
1190         case BPF_MAP_TYPE_CPUMAP:
1191                 if (!bpf_capable())
1192                         return -EPERM;
1193                 break;
1194         case BPF_MAP_TYPE_SOCKMAP:
1195         case BPF_MAP_TYPE_SOCKHASH:
1196         case BPF_MAP_TYPE_DEVMAP:
1197         case BPF_MAP_TYPE_DEVMAP_HASH:
1198         case BPF_MAP_TYPE_XSKMAP:
1199                 if (!capable(CAP_NET_ADMIN))
1200                         return -EPERM;
1201                 break;
1202         default:
1203                 WARN(1, "unsupported map type %d", map_type);
1204                 return -EPERM;
1205         }
1206
1207         map = ops->map_alloc(attr);
1208         if (IS_ERR(map))
1209                 return PTR_ERR(map);
1210         map->ops = ops;
1211         map->map_type = map_type;
1212
1213         err = bpf_obj_name_cpy(map->name, attr->map_name,
1214                                sizeof(attr->map_name));
1215         if (err < 0)
1216                 goto free_map;
1217
1218         atomic64_set(&map->refcnt, 1);
1219         atomic64_set(&map->usercnt, 1);
1220         mutex_init(&map->freeze_mutex);
1221         spin_lock_init(&map->owner.lock);
1222
1223         if (attr->btf_key_type_id || attr->btf_value_type_id ||
1224             /* Even the map's value is a kernel's struct,
1225              * the bpf_prog.o must have BTF to begin with
1226              * to figure out the corresponding kernel's
1227              * counter part.  Thus, attr->btf_fd has
1228              * to be valid also.
1229              */
1230             attr->btf_vmlinux_value_type_id) {
1231                 struct btf *btf;
1232
1233                 btf = btf_get_by_fd(attr->btf_fd);
1234                 if (IS_ERR(btf)) {
1235                         err = PTR_ERR(btf);
1236                         goto free_map;
1237                 }
1238                 if (btf_is_kernel(btf)) {
1239                         btf_put(btf);
1240                         err = -EACCES;
1241                         goto free_map;
1242                 }
1243                 map->btf = btf;
1244
1245                 if (attr->btf_value_type_id) {
1246                         err = map_check_btf(map, btf, attr->btf_key_type_id,
1247                                             attr->btf_value_type_id);
1248                         if (err)
1249                                 goto free_map;
1250                 }
1251
1252                 map->btf_key_type_id = attr->btf_key_type_id;
1253                 map->btf_value_type_id = attr->btf_value_type_id;
1254                 map->btf_vmlinux_value_type_id =
1255                         attr->btf_vmlinux_value_type_id;
1256         }
1257
1258         err = security_bpf_map_alloc(map);
1259         if (err)
1260                 goto free_map;
1261
1262         err = bpf_map_alloc_id(map);
1263         if (err)
1264                 goto free_map_sec;
1265
1266         bpf_map_save_memcg(map);
1267
1268         err = bpf_map_new_fd(map, f_flags);
1269         if (err < 0) {
1270                 /* failed to allocate fd.
1271                  * bpf_map_put_with_uref() is needed because the above
1272                  * bpf_map_alloc_id() has published the map
1273                  * to the userspace and the userspace may
1274                  * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
1275                  */
1276                 bpf_map_put_with_uref(map);
1277                 return err;
1278         }
1279
1280         return err;
1281
1282 free_map_sec:
1283         security_bpf_map_free(map);
1284 free_map:
1285         btf_put(map->btf);
1286         map->ops->map_free(map);
1287         return err;
1288 }
1289
1290 /* if error is returned, fd is released.
1291  * On success caller should complete fd access with matching fdput()
1292  */
1293 struct bpf_map *__bpf_map_get(struct fd f)
1294 {
1295         if (!f.file)
1296                 return ERR_PTR(-EBADF);
1297         if (f.file->f_op != &bpf_map_fops) {
1298                 fdput(f);
1299                 return ERR_PTR(-EINVAL);
1300         }
1301
1302         return f.file->private_data;
1303 }
1304
1305 void bpf_map_inc(struct bpf_map *map)
1306 {
1307         atomic64_inc(&map->refcnt);
1308 }
1309 EXPORT_SYMBOL_GPL(bpf_map_inc);
1310
1311 void bpf_map_inc_with_uref(struct bpf_map *map)
1312 {
1313         atomic64_inc(&map->refcnt);
1314         atomic64_inc(&map->usercnt);
1315 }
1316 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
1317
1318 struct bpf_map *bpf_map_get(u32 ufd)
1319 {
1320         struct fd f = fdget(ufd);
1321         struct bpf_map *map;
1322
1323         map = __bpf_map_get(f);
1324         if (IS_ERR(map))
1325                 return map;
1326
1327         bpf_map_inc(map);
1328         fdput(f);
1329
1330         return map;
1331 }
1332 EXPORT_SYMBOL(bpf_map_get);
1333
1334 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
1335 {
1336         struct fd f = fdget(ufd);
1337         struct bpf_map *map;
1338
1339         map = __bpf_map_get(f);
1340         if (IS_ERR(map))
1341                 return map;
1342
1343         bpf_map_inc_with_uref(map);
1344         fdput(f);
1345
1346         return map;
1347 }
1348
1349 /* map_idr_lock should have been held or the map should have been
1350  * protected by rcu read lock.
1351  */
1352 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
1353 {
1354         int refold;
1355
1356         refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
1357         if (!refold)
1358                 return ERR_PTR(-ENOENT);
1359         if (uref)
1360                 atomic64_inc(&map->usercnt);
1361
1362         return map;
1363 }
1364
1365 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
1366 {
1367         spin_lock_bh(&map_idr_lock);
1368         map = __bpf_map_inc_not_zero(map, false);
1369         spin_unlock_bh(&map_idr_lock);
1370
1371         return map;
1372 }
1373 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
1374
1375 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
1376 {
1377         return -ENOTSUPP;
1378 }
1379
1380 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1381 {
1382         if (key_size)
1383                 return vmemdup_user(ukey, key_size);
1384
1385         if (ukey)
1386                 return ERR_PTR(-EINVAL);
1387
1388         return NULL;
1389 }
1390
1391 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1392 {
1393         if (key_size)
1394                 return kvmemdup_bpfptr(ukey, key_size);
1395
1396         if (!bpfptr_is_null(ukey))
1397                 return ERR_PTR(-EINVAL);
1398
1399         return NULL;
1400 }
1401
1402 /* last field in 'union bpf_attr' used by this command */
1403 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1404
1405 static int map_lookup_elem(union bpf_attr *attr)
1406 {
1407         void __user *ukey = u64_to_user_ptr(attr->key);
1408         void __user *uvalue = u64_to_user_ptr(attr->value);
1409         int ufd = attr->map_fd;
1410         struct bpf_map *map;
1411         void *key, *value;
1412         u32 value_size;
1413         struct fd f;
1414         int err;
1415
1416         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1417                 return -EINVAL;
1418
1419         if (attr->flags & ~BPF_F_LOCK)
1420                 return -EINVAL;
1421
1422         f = fdget(ufd);
1423         map = __bpf_map_get(f);
1424         if (IS_ERR(map))
1425                 return PTR_ERR(map);
1426         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1427                 err = -EPERM;
1428                 goto err_put;
1429         }
1430
1431         if ((attr->flags & BPF_F_LOCK) &&
1432             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1433                 err = -EINVAL;
1434                 goto err_put;
1435         }
1436
1437         key = __bpf_copy_key(ukey, map->key_size);
1438         if (IS_ERR(key)) {
1439                 err = PTR_ERR(key);
1440                 goto err_put;
1441         }
1442
1443         value_size = bpf_map_value_size(map);
1444
1445         err = -ENOMEM;
1446         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1447         if (!value)
1448                 goto free_key;
1449
1450         if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1451                 if (copy_from_user(value, uvalue, value_size))
1452                         err = -EFAULT;
1453                 else
1454                         err = bpf_map_copy_value(map, key, value, attr->flags);
1455                 goto free_value;
1456         }
1457
1458         err = bpf_map_copy_value(map, key, value, attr->flags);
1459         if (err)
1460                 goto free_value;
1461
1462         err = -EFAULT;
1463         if (copy_to_user(uvalue, value, value_size) != 0)
1464                 goto free_value;
1465
1466         err = 0;
1467
1468 free_value:
1469         kvfree(value);
1470 free_key:
1471         kvfree(key);
1472 err_put:
1473         fdput(f);
1474         return err;
1475 }
1476
1477
1478 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1479
1480 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1481 {
1482         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1483         bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
1484         int ufd = attr->map_fd;
1485         struct bpf_map *map;
1486         void *key, *value;
1487         u32 value_size;
1488         struct fd f;
1489         int err;
1490
1491         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1492                 return -EINVAL;
1493
1494         f = fdget(ufd);
1495         map = __bpf_map_get(f);
1496         if (IS_ERR(map))
1497                 return PTR_ERR(map);
1498         bpf_map_write_active_inc(map);
1499         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1500                 err = -EPERM;
1501                 goto err_put;
1502         }
1503
1504         if ((attr->flags & BPF_F_LOCK) &&
1505             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1506                 err = -EINVAL;
1507                 goto err_put;
1508         }
1509
1510         key = ___bpf_copy_key(ukey, map->key_size);
1511         if (IS_ERR(key)) {
1512                 err = PTR_ERR(key);
1513                 goto err_put;
1514         }
1515
1516         value_size = bpf_map_value_size(map);
1517         value = kvmemdup_bpfptr(uvalue, value_size);
1518         if (IS_ERR(value)) {
1519                 err = PTR_ERR(value);
1520                 goto free_key;
1521         }
1522
1523         err = bpf_map_update_value(map, f.file, key, value, attr->flags);
1524
1525         kvfree(value);
1526 free_key:
1527         kvfree(key);
1528 err_put:
1529         bpf_map_write_active_dec(map);
1530         fdput(f);
1531         return err;
1532 }
1533
1534 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1535
1536 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1537 {
1538         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1539         int ufd = attr->map_fd;
1540         struct bpf_map *map;
1541         struct fd f;
1542         void *key;
1543         int err;
1544
1545         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1546                 return -EINVAL;
1547
1548         f = fdget(ufd);
1549         map = __bpf_map_get(f);
1550         if (IS_ERR(map))
1551                 return PTR_ERR(map);
1552         bpf_map_write_active_inc(map);
1553         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1554                 err = -EPERM;
1555                 goto err_put;
1556         }
1557
1558         key = ___bpf_copy_key(ukey, map->key_size);
1559         if (IS_ERR(key)) {
1560                 err = PTR_ERR(key);
1561                 goto err_put;
1562         }
1563
1564         if (bpf_map_is_offloaded(map)) {
1565                 err = bpf_map_offload_delete_elem(map, key);
1566                 goto out;
1567         } else if (IS_FD_PROG_ARRAY(map) ||
1568                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1569                 /* These maps require sleepable context */
1570                 err = map->ops->map_delete_elem(map, key);
1571                 goto out;
1572         }
1573
1574         bpf_disable_instrumentation();
1575         rcu_read_lock();
1576         err = map->ops->map_delete_elem(map, key);
1577         rcu_read_unlock();
1578         bpf_enable_instrumentation();
1579         maybe_wait_bpf_programs(map);
1580 out:
1581         kvfree(key);
1582 err_put:
1583         bpf_map_write_active_dec(map);
1584         fdput(f);
1585         return err;
1586 }
1587
1588 /* last field in 'union bpf_attr' used by this command */
1589 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1590
1591 static int map_get_next_key(union bpf_attr *attr)
1592 {
1593         void __user *ukey = u64_to_user_ptr(attr->key);
1594         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1595         int ufd = attr->map_fd;
1596         struct bpf_map *map;
1597         void *key, *next_key;
1598         struct fd f;
1599         int err;
1600
1601         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1602                 return -EINVAL;
1603
1604         f = fdget(ufd);
1605         map = __bpf_map_get(f);
1606         if (IS_ERR(map))
1607                 return PTR_ERR(map);
1608         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1609                 err = -EPERM;
1610                 goto err_put;
1611         }
1612
1613         if (ukey) {
1614                 key = __bpf_copy_key(ukey, map->key_size);
1615                 if (IS_ERR(key)) {
1616                         err = PTR_ERR(key);
1617                         goto err_put;
1618                 }
1619         } else {
1620                 key = NULL;
1621         }
1622
1623         err = -ENOMEM;
1624         next_key = kvmalloc(map->key_size, GFP_USER);
1625         if (!next_key)
1626                 goto free_key;
1627
1628         if (bpf_map_is_offloaded(map)) {
1629                 err = bpf_map_offload_get_next_key(map, key, next_key);
1630                 goto out;
1631         }
1632
1633         rcu_read_lock();
1634         err = map->ops->map_get_next_key(map, key, next_key);
1635         rcu_read_unlock();
1636 out:
1637         if (err)
1638                 goto free_next_key;
1639
1640         err = -EFAULT;
1641         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1642                 goto free_next_key;
1643
1644         err = 0;
1645
1646 free_next_key:
1647         kvfree(next_key);
1648 free_key:
1649         kvfree(key);
1650 err_put:
1651         fdput(f);
1652         return err;
1653 }
1654
1655 int generic_map_delete_batch(struct bpf_map *map,
1656                              const union bpf_attr *attr,
1657                              union bpf_attr __user *uattr)
1658 {
1659         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1660         u32 cp, max_count;
1661         int err = 0;
1662         void *key;
1663
1664         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1665                 return -EINVAL;
1666
1667         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1668             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1669                 return -EINVAL;
1670         }
1671
1672         max_count = attr->batch.count;
1673         if (!max_count)
1674                 return 0;
1675
1676         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1677         if (!key)
1678                 return -ENOMEM;
1679
1680         for (cp = 0; cp < max_count; cp++) {
1681                 err = -EFAULT;
1682                 if (copy_from_user(key, keys + cp * map->key_size,
1683                                    map->key_size))
1684                         break;
1685
1686                 if (bpf_map_is_offloaded(map)) {
1687                         err = bpf_map_offload_delete_elem(map, key);
1688                         break;
1689                 }
1690
1691                 bpf_disable_instrumentation();
1692                 rcu_read_lock();
1693                 err = map->ops->map_delete_elem(map, key);
1694                 rcu_read_unlock();
1695                 bpf_enable_instrumentation();
1696                 if (err)
1697                         break;
1698                 cond_resched();
1699         }
1700         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1701                 err = -EFAULT;
1702
1703         kvfree(key);
1704
1705         maybe_wait_bpf_programs(map);
1706         return err;
1707 }
1708
1709 int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
1710                              const union bpf_attr *attr,
1711                              union bpf_attr __user *uattr)
1712 {
1713         void __user *values = u64_to_user_ptr(attr->batch.values);
1714         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1715         u32 value_size, cp, max_count;
1716         void *key, *value;
1717         int err = 0;
1718
1719         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1720                 return -EINVAL;
1721
1722         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1723             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1724                 return -EINVAL;
1725         }
1726
1727         value_size = bpf_map_value_size(map);
1728
1729         max_count = attr->batch.count;
1730         if (!max_count)
1731                 return 0;
1732
1733         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1734         if (!key)
1735                 return -ENOMEM;
1736
1737         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1738         if (!value) {
1739                 kvfree(key);
1740                 return -ENOMEM;
1741         }
1742
1743         for (cp = 0; cp < max_count; cp++) {
1744                 err = -EFAULT;
1745                 if (copy_from_user(key, keys + cp * map->key_size,
1746                     map->key_size) ||
1747                     copy_from_user(value, values + cp * value_size, value_size))
1748                         break;
1749
1750                 err = bpf_map_update_value(map, map_file, key, value,
1751                                            attr->batch.elem_flags);
1752
1753                 if (err)
1754                         break;
1755                 cond_resched();
1756         }
1757
1758         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1759                 err = -EFAULT;
1760
1761         kvfree(value);
1762         kvfree(key);
1763         return err;
1764 }
1765
1766 #define MAP_LOOKUP_RETRIES 3
1767
1768 int generic_map_lookup_batch(struct bpf_map *map,
1769                                     const union bpf_attr *attr,
1770                                     union bpf_attr __user *uattr)
1771 {
1772         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1773         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1774         void __user *values = u64_to_user_ptr(attr->batch.values);
1775         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1776         void *buf, *buf_prevkey, *prev_key, *key, *value;
1777         int err, retry = MAP_LOOKUP_RETRIES;
1778         u32 value_size, cp, max_count;
1779
1780         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1781                 return -EINVAL;
1782
1783         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1784             !btf_record_has_field(map->record, BPF_SPIN_LOCK))
1785                 return -EINVAL;
1786
1787         value_size = bpf_map_value_size(map);
1788
1789         max_count = attr->batch.count;
1790         if (!max_count)
1791                 return 0;
1792
1793         if (put_user(0, &uattr->batch.count))
1794                 return -EFAULT;
1795
1796         buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1797         if (!buf_prevkey)
1798                 return -ENOMEM;
1799
1800         buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1801         if (!buf) {
1802                 kvfree(buf_prevkey);
1803                 return -ENOMEM;
1804         }
1805
1806         err = -EFAULT;
1807         prev_key = NULL;
1808         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1809                 goto free_buf;
1810         key = buf;
1811         value = key + map->key_size;
1812         if (ubatch)
1813                 prev_key = buf_prevkey;
1814
1815         for (cp = 0; cp < max_count;) {
1816                 rcu_read_lock();
1817                 err = map->ops->map_get_next_key(map, prev_key, key);
1818                 rcu_read_unlock();
1819                 if (err)
1820                         break;
1821                 err = bpf_map_copy_value(map, key, value,
1822                                          attr->batch.elem_flags);
1823
1824                 if (err == -ENOENT) {
1825                         if (retry) {
1826                                 retry--;
1827                                 continue;
1828                         }
1829                         err = -EINTR;
1830                         break;
1831                 }
1832
1833                 if (err)
1834                         goto free_buf;
1835
1836                 if (copy_to_user(keys + cp * map->key_size, key,
1837                                  map->key_size)) {
1838                         err = -EFAULT;
1839                         goto free_buf;
1840                 }
1841                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1842                         err = -EFAULT;
1843                         goto free_buf;
1844                 }
1845
1846                 if (!prev_key)
1847                         prev_key = buf_prevkey;
1848
1849                 swap(prev_key, key);
1850                 retry = MAP_LOOKUP_RETRIES;
1851                 cp++;
1852                 cond_resched();
1853         }
1854
1855         if (err == -EFAULT)
1856                 goto free_buf;
1857
1858         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1859                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1860                 err = -EFAULT;
1861
1862 free_buf:
1863         kvfree(buf_prevkey);
1864         kvfree(buf);
1865         return err;
1866 }
1867
1868 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
1869
1870 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1871 {
1872         void __user *ukey = u64_to_user_ptr(attr->key);
1873         void __user *uvalue = u64_to_user_ptr(attr->value);
1874         int ufd = attr->map_fd;
1875         struct bpf_map *map;
1876         void *key, *value;
1877         u32 value_size;
1878         struct fd f;
1879         int err;
1880
1881         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1882                 return -EINVAL;
1883
1884         if (attr->flags & ~BPF_F_LOCK)
1885                 return -EINVAL;
1886
1887         f = fdget(ufd);
1888         map = __bpf_map_get(f);
1889         if (IS_ERR(map))
1890                 return PTR_ERR(map);
1891         bpf_map_write_active_inc(map);
1892         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
1893             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1894                 err = -EPERM;
1895                 goto err_put;
1896         }
1897
1898         if (attr->flags &&
1899             (map->map_type == BPF_MAP_TYPE_QUEUE ||
1900              map->map_type == BPF_MAP_TYPE_STACK)) {
1901                 err = -EINVAL;
1902                 goto err_put;
1903         }
1904
1905         if ((attr->flags & BPF_F_LOCK) &&
1906             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1907                 err = -EINVAL;
1908                 goto err_put;
1909         }
1910
1911         key = __bpf_copy_key(ukey, map->key_size);
1912         if (IS_ERR(key)) {
1913                 err = PTR_ERR(key);
1914                 goto err_put;
1915         }
1916
1917         value_size = bpf_map_value_size(map);
1918
1919         err = -ENOMEM;
1920         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1921         if (!value)
1922                 goto free_key;
1923
1924         err = -ENOTSUPP;
1925         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1926             map->map_type == BPF_MAP_TYPE_STACK) {
1927                 err = map->ops->map_pop_elem(map, value);
1928         } else if (map->map_type == BPF_MAP_TYPE_HASH ||
1929                    map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
1930                    map->map_type == BPF_MAP_TYPE_LRU_HASH ||
1931                    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
1932                 if (!bpf_map_is_offloaded(map)) {
1933                         bpf_disable_instrumentation();
1934                         rcu_read_lock();
1935                         err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
1936                         rcu_read_unlock();
1937                         bpf_enable_instrumentation();
1938                 }
1939         }
1940
1941         if (err)
1942                 goto free_value;
1943
1944         if (copy_to_user(uvalue, value, value_size) != 0) {
1945                 err = -EFAULT;
1946                 goto free_value;
1947         }
1948
1949         err = 0;
1950
1951 free_value:
1952         kvfree(value);
1953 free_key:
1954         kvfree(key);
1955 err_put:
1956         bpf_map_write_active_dec(map);
1957         fdput(f);
1958         return err;
1959 }
1960
1961 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
1962
1963 static int map_freeze(const union bpf_attr *attr)
1964 {
1965         int err = 0, ufd = attr->map_fd;
1966         struct bpf_map *map;
1967         struct fd f;
1968
1969         if (CHECK_ATTR(BPF_MAP_FREEZE))
1970                 return -EINVAL;
1971
1972         f = fdget(ufd);
1973         map = __bpf_map_get(f);
1974         if (IS_ERR(map))
1975                 return PTR_ERR(map);
1976
1977         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
1978                 fdput(f);
1979                 return -ENOTSUPP;
1980         }
1981
1982         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1983                 fdput(f);
1984                 return -EPERM;
1985         }
1986
1987         mutex_lock(&map->freeze_mutex);
1988         if (bpf_map_write_active(map)) {
1989                 err = -EBUSY;
1990                 goto err_put;
1991         }
1992         if (READ_ONCE(map->frozen)) {
1993                 err = -EBUSY;
1994                 goto err_put;
1995         }
1996
1997         WRITE_ONCE(map->frozen, true);
1998 err_put:
1999         mutex_unlock(&map->freeze_mutex);
2000         fdput(f);
2001         return err;
2002 }
2003
2004 static const struct bpf_prog_ops * const bpf_prog_types[] = {
2005 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
2006         [_id] = & _name ## _prog_ops,
2007 #define BPF_MAP_TYPE(_id, _ops)
2008 #define BPF_LINK_TYPE(_id, _name)
2009 #include <linux/bpf_types.h>
2010 #undef BPF_PROG_TYPE
2011 #undef BPF_MAP_TYPE
2012 #undef BPF_LINK_TYPE
2013 };
2014
2015 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
2016 {
2017         const struct bpf_prog_ops *ops;
2018
2019         if (type >= ARRAY_SIZE(bpf_prog_types))
2020                 return -EINVAL;
2021         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
2022         ops = bpf_prog_types[type];
2023         if (!ops)
2024                 return -EINVAL;
2025
2026         if (!bpf_prog_is_offloaded(prog->aux))
2027                 prog->aux->ops = ops;
2028         else
2029                 prog->aux->ops = &bpf_offload_prog_ops;
2030         prog->type = type;
2031         return 0;
2032 }
2033
/* Operations that bpf_audit_prog() can report; BPF_AUDIT_MAX is the count. */
enum bpf_audit {
	BPF_AUDIT_LOAD = 0,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,
};

/* Text placed in the "op=" field of the audit record, indexed by the enum. */
static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
	[BPF_AUDIT_LOAD] = "LOAD",
	[BPF_AUDIT_UNLOAD] = "UNLOAD",
};
2044
2045 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
2046 {
2047         struct audit_context *ctx = NULL;
2048         struct audit_buffer *ab;
2049
2050         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
2051                 return;
2052         if (audit_enabled == AUDIT_OFF)
2053                 return;
2054         if (!in_irq() && !irqs_disabled())
2055                 ctx = audit_context();
2056         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
2057         if (unlikely(!ab))
2058                 return;
2059         audit_log_format(ab, "prog-id=%u op=%s",
2060                          prog->aux->id, bpf_audit_str[op]);
2061         audit_log_end(ab);
2062 }
2063
2064 static int bpf_prog_alloc_id(struct bpf_prog *prog)
2065 {
2066         int id;
2067
2068         idr_preload(GFP_KERNEL);
2069         spin_lock_bh(&prog_idr_lock);
2070         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
2071         if (id > 0)
2072                 prog->aux->id = id;
2073         spin_unlock_bh(&prog_idr_lock);
2074         idr_preload_end();
2075
2076         /* id is in [1, INT_MAX) */
2077         if (WARN_ON_ONCE(!id))
2078                 return -ENOSPC;
2079
2080         return id > 0 ? 0 : id;
2081 }
2082
2083 void bpf_prog_free_id(struct bpf_prog *prog)
2084 {
2085         unsigned long flags;
2086
2087         /* cBPF to eBPF migrations are currently not in the idr store.
2088          * Offloaded programs are removed from the store when their device
2089          * disappears - even if someone grabs an fd to them they are unusable,
2090          * simply waiting for refcnt to drop to be freed.
2091          */
2092         if (!prog->aux->id)
2093                 return;
2094
2095         spin_lock_irqsave(&prog_idr_lock, flags);
2096         idr_remove(&prog_idr, prog->aux->id);
2097         prog->aux->id = 0;
2098         spin_unlock_irqrestore(&prog_idr_lock, flags);
2099 }
2100
2101 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2102 {
2103         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
2104
2105         kvfree(aux->func_info);
2106         kfree(aux->func_info_aux);
2107         free_uid(aux->user);
2108         security_bpf_prog_free(aux);
2109         bpf_prog_free(aux->prog);
2110 }
2111
2112 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2113 {
2114         bpf_prog_kallsyms_del_all(prog);
2115         btf_put(prog->aux->btf);
2116         module_put(prog->aux->mod);
2117         kvfree(prog->aux->jited_linfo);
2118         kvfree(prog->aux->linfo);
2119         kfree(prog->aux->kfunc_tab);
2120         if (prog->aux->attach_btf)
2121                 btf_put(prog->aux->attach_btf);
2122
2123         if (deferred) {
2124                 if (prog->aux->sleepable)
2125                         call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
2126                 else
2127                         call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
2128         } else {
2129                 __bpf_prog_put_rcu(&prog->aux->rcu);
2130         }
2131 }
2132
2133 static void bpf_prog_put_deferred(struct work_struct *work)
2134 {
2135         struct bpf_prog_aux *aux;
2136         struct bpf_prog *prog;
2137
2138         aux = container_of(work, struct bpf_prog_aux, work);
2139         prog = aux->prog;
2140         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2141         bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
2142         bpf_prog_free_id(prog);
2143         __bpf_prog_put_noref(prog, true);
2144 }
2145
2146 static void __bpf_prog_put(struct bpf_prog *prog)
2147 {
2148         struct bpf_prog_aux *aux = prog->aux;
2149
2150         if (atomic64_dec_and_test(&aux->refcnt)) {
2151                 if (in_irq() || irqs_disabled()) {
2152                         INIT_WORK(&aux->work, bpf_prog_put_deferred);
2153                         schedule_work(&aux->work);
2154                 } else {
2155                         bpf_prog_put_deferred(&aux->work);
2156                 }
2157         }
2158 }
2159
/*
 * Exported entry point for dropping one reference on @prog; all of the
 * work is done by __bpf_prog_put().
 */
2160 void bpf_prog_put(struct bpf_prog *prog)
2161 {
2162         __bpf_prog_put(prog);
2163 }
2164 EXPORT_SYMBOL_GPL(bpf_prog_put);
2165
2166 static int bpf_prog_release(struct inode *inode, struct file *filp)
2167 {
2168         struct bpf_prog *prog = filp->private_data;
2169
2170         bpf_prog_put(prog);
2171         return 0;
2172 }
2173
/*
 * Program statistics summed over all possible CPUs; filled in by
 * bpf_prog_get_stats().
 */
2174 struct bpf_prog_kstats {
2175         u64 nsecs;	/* shown as "run_time_ns" in fdinfo */
2176         u64 cnt;	/* shown as "run_cnt" in fdinfo */
2177         u64 misses;	/* shown as "recursion_misses" in fdinfo */
2178 };
2179
2180 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2181 {
2182         struct bpf_prog_stats *stats;
2183         unsigned int flags;
2184
2185         stats = this_cpu_ptr(prog->stats);
2186         flags = u64_stats_update_begin_irqsave(&stats->syncp);
2187         u64_stats_inc(&stats->misses);
2188         u64_stats_update_end_irqrestore(&stats->syncp, flags);
2189 }
2190
2191 static void bpf_prog_get_stats(const struct bpf_prog *prog,
2192                                struct bpf_prog_kstats *stats)
2193 {
2194         u64 nsecs = 0, cnt = 0, misses = 0;
2195         int cpu;
2196
2197         for_each_possible_cpu(cpu) {
2198                 const struct bpf_prog_stats *st;
2199                 unsigned int start;
2200                 u64 tnsecs, tcnt, tmisses;
2201
2202                 st = per_cpu_ptr(prog->stats, cpu);
2203                 do {
2204                         start = u64_stats_fetch_begin(&st->syncp);
2205                         tnsecs = u64_stats_read(&st->nsecs);
2206                         tcnt = u64_stats_read(&st->cnt);
2207                         tmisses = u64_stats_read(&st->misses);
2208                 } while (u64_stats_fetch_retry(&st->syncp, start));
2209                 nsecs += tnsecs;
2210                 cnt += tcnt;
2211                 misses += tmisses;
2212         }
2213         stats->nsecs = nsecs;
2214         stats->cnt = cnt;
2215         stats->misses = misses;
2216 }
2217
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo handler of bpf_prog_fops: print one "name:\tvalue" line
 * per attribute of the program behind @filp - type, jited flag, hex tag,
 * memlock (pages converted to bytes), id, aggregated run statistics and
 * verified instruction count.
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_prog *prog = filp->private_data;
	/* Two hex digits per tag byte, plus the terminating NUL. */
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
	struct bpf_prog_kstats stats;
	unsigned long long memlock;

	bpf_prog_get_stats(prog, &stats);
	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
	memlock = (prog->pages * 1ULL) << PAGE_SHIFT;

	seq_printf(m,
		   "prog_type:\t%u\n"
		   "prog_jited:\t%u\n"
		   "prog_tag:\t%s\n"
		   "memlock:\t%llu\n"
		   "prog_id:\t%u\n"
		   "run_time_ns:\t%llu\n"
		   "run_cnt:\t%llu\n"
		   "recursion_misses:\t%llu\n"
		   "verified_insns:\t%u\n",
		   prog->type,
		   prog->jited,
		   prog_tag,
		   memlock,
		   prog->aux->id,
		   stats.nsecs,
		   stats.cnt,
		   stats.misses,
		   prog->aux->verified_insns);
}
#endif
2248
2249 const struct file_operations bpf_prog_fops = {
2250 #ifdef CONFIG_PROC_FS
2251         .show_fdinfo    = bpf_prog_show_fdinfo,
2252 #endif
2253         .release        = bpf_prog_release,
2254         .read           = bpf_dummy_read,
2255         .write          = bpf_dummy_write,
2256 };
2257
2258 int bpf_prog_new_fd(struct bpf_prog *prog)
2259 {
2260         int ret;
2261
2262         ret = security_bpf_prog(prog);
2263         if (ret < 0)
2264                 return ret;
2265
2266         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
2267                                 O_RDWR | O_CLOEXEC);
2268 }
2269
2270 static struct bpf_prog *____bpf_prog_get(struct fd f)
2271 {
2272         if (!f.file)
2273                 return ERR_PTR(-EBADF);
2274         if (f.file->f_op != &bpf_prog_fops) {
2275                 fdput(f);
2276                 return ERR_PTR(-EINVAL);
2277         }
2278
2279         return f.file->private_data;
2280 }
2281
2282 void bpf_prog_add(struct bpf_prog *prog, int i)
2283 {
2284         atomic64_add(i, &prog->aux->refcnt);
2285 }
2286 EXPORT_SYMBOL_GPL(bpf_prog_add);
2287
2288 void bpf_prog_sub(struct bpf_prog *prog, int i)
2289 {
2290         /* Only to be used for undoing previous bpf_prog_add() in some
2291          * error path. We still know that another entity in our call
2292          * path holds a reference to the program, thus atomic_sub() can
2293          * be safely used in such cases!
2294          */
2295         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2296 }
2297 EXPORT_SYMBOL_GPL(bpf_prog_sub);
2298
2299 void bpf_prog_inc(struct bpf_prog *prog)
2300 {
2301         atomic64_inc(&prog->aux->refcnt);
2302 }
2303 EXPORT_SYMBOL_GPL(bpf_prog_inc);
2304
2305 /* prog_idr_lock should have been held */
2306 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2307 {
2308         int refold;
2309
2310         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
2311
2312         if (!refold)
2313                 return ERR_PTR(-ENOENT);
2314
2315         return prog;
2316 }
2317 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2318
2319 bool bpf_prog_get_ok(struct bpf_prog *prog,
2320                             enum bpf_prog_type *attach_type, bool attach_drv)
2321 {
2322         /* not an attachment, just a refcount inc, always allow */
2323         if (!attach_type)
2324                 return true;
2325
2326         if (prog->type != *attach_type)
2327                 return false;
2328         if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
2329                 return false;
2330
2331         return true;
2332 }
2333
2334 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2335                                        bool attach_drv)
2336 {
2337         struct fd f = fdget(ufd);
2338         struct bpf_prog *prog;
2339
2340         prog = ____bpf_prog_get(f);
2341         if (IS_ERR(prog))
2342                 return prog;
2343         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2344                 prog = ERR_PTR(-EINVAL);
2345                 goto out;
2346         }
2347
2348         bpf_prog_inc(prog);
2349 out:
2350         fdput(f);
2351         return prog;
2352 }
2353
/*
 * Get a referenced BPF program from @ufd.  No attach type is passed to
 * __bpf_prog_get(), so bpf_prog_get_ok() accepts any program.  Returns
 * the program or an ERR_PTR().
 */
2354 struct bpf_prog *bpf_prog_get(u32 ufd)
2355 {
2356         return __bpf_prog_get(ufd, NULL, false);
2357 }
2358
/*
 * Get a referenced BPF program from @ufd, requiring it to be of @type.
 * @attach_drv is forwarded to __bpf_prog_get() and controls whether an
 * offloaded program is accepted.  Returns the program or an ERR_PTR().
 */
2359 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2360                                        bool attach_drv)
2361 {
2362         return __bpf_prog_get(ufd, &type, attach_drv);
2363 }
2364 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2365
2366 /* Initially all BPF programs could be loaded w/o specifying
2367  * expected_attach_type. Later for some of them specifying expected_attach_type
2368  * at load time became required so that program could be validated properly.
2369  * Programs of types that are allowed to be loaded both w/ and w/o (for
2370  * backward compatibility) expected_attach_type, should have the default attach
2371  * type assigned to expected_attach_type for the latter case, so that it can be
2372  * validated later at attach time.
2373  *
2374  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2375  * prog type requires it but has some attach types that have to be backward
2376  * compatible.
2377  */
2378 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2379 {
2380         switch (attr->prog_type) {
2381         case BPF_PROG_TYPE_CGROUP_SOCK:
2382                 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2383                  * exist so checking for non-zero is the way to go here.
2384                  */
2385                 if (!attr->expected_attach_type)
2386                         attr->expected_attach_type =
2387                                 BPF_CGROUP_INET_SOCK_CREATE;
2388                 break;
2389         case BPF_PROG_TYPE_SK_REUSEPORT:
2390                 if (!attr->expected_attach_type)
2391                         attr->expected_attach_type =
2392                                 BPF_SK_REUSEPORT_SELECT;
2393                 break;
2394         }
2395 }
2396
2397 static int
2398 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2399                            enum bpf_attach_type expected_attach_type,
2400                            struct btf *attach_btf, u32 btf_id,
2401                            struct bpf_prog *dst_prog)
2402 {
2403         if (btf_id) {
2404                 if (btf_id > BTF_MAX_TYPE)
2405                         return -EINVAL;
2406
2407                 if (!attach_btf && !dst_prog)
2408                         return -EINVAL;
2409
2410                 switch (prog_type) {
2411                 case BPF_PROG_TYPE_TRACING:
2412                 case BPF_PROG_TYPE_LSM:
2413                 case BPF_PROG_TYPE_STRUCT_OPS:
2414                 case BPF_PROG_TYPE_EXT:
2415                         break;
2416                 default:
2417                         return -EINVAL;
2418                 }
2419         }
2420
2421         if (attach_btf && (!btf_id || dst_prog))
2422                 return -EINVAL;
2423
2424         if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2425             prog_type != BPF_PROG_TYPE_EXT)
2426                 return -EINVAL;
2427
2428         switch (prog_type) {
2429         case BPF_PROG_TYPE_CGROUP_SOCK:
2430                 switch (expected_attach_type) {
2431                 case BPF_CGROUP_INET_SOCK_CREATE:
2432                 case BPF_CGROUP_INET_SOCK_RELEASE:
2433                 case BPF_CGROUP_INET4_POST_BIND:
2434                 case BPF_CGROUP_INET6_POST_BIND:
2435                         return 0;
2436                 default:
2437                         return -EINVAL;
2438                 }
2439         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2440                 switch (expected_attach_type) {
2441                 case BPF_CGROUP_INET4_BIND:
2442                 case BPF_CGROUP_INET6_BIND:
2443                 case BPF_CGROUP_INET4_CONNECT:
2444                 case BPF_CGROUP_INET6_CONNECT:
2445                 case BPF_CGROUP_INET4_GETPEERNAME:
2446                 case BPF_CGROUP_INET6_GETPEERNAME:
2447                 case BPF_CGROUP_INET4_GETSOCKNAME:
2448                 case BPF_CGROUP_INET6_GETSOCKNAME:
2449                 case BPF_CGROUP_UDP4_SENDMSG:
2450                 case BPF_CGROUP_UDP6_SENDMSG:
2451                 case BPF_CGROUP_UDP4_RECVMSG:
2452                 case BPF_CGROUP_UDP6_RECVMSG:
2453                         return 0;
2454                 default:
2455                         return -EINVAL;
2456                 }
2457         case BPF_PROG_TYPE_CGROUP_SKB:
2458                 switch (expected_attach_type) {
2459                 case BPF_CGROUP_INET_INGRESS:
2460                 case BPF_CGROUP_INET_EGRESS:
2461                         return 0;
2462                 default:
2463                         return -EINVAL;
2464                 }
2465         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2466                 switch (expected_attach_type) {
2467                 case BPF_CGROUP_SETSOCKOPT:
2468                 case BPF_CGROUP_GETSOCKOPT:
2469                         return 0;
2470                 default:
2471                         return -EINVAL;
2472                 }
2473         case BPF_PROG_TYPE_SK_LOOKUP:
2474                 if (expected_attach_type == BPF_SK_LOOKUP)
2475                         return 0;
2476                 return -EINVAL;
2477         case BPF_PROG_TYPE_SK_REUSEPORT:
2478                 switch (expected_attach_type) {
2479                 case BPF_SK_REUSEPORT_SELECT:
2480                 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2481                         return 0;
2482                 default:
2483                         return -EINVAL;
2484                 }
2485         case BPF_PROG_TYPE_NETFILTER:
2486                 if (expected_attach_type == BPF_NETFILTER)
2487                         return 0;
2488                 return -EINVAL;
2489         case BPF_PROG_TYPE_SYSCALL:
2490         case BPF_PROG_TYPE_EXT:
2491                 if (expected_attach_type)
2492                         return -EINVAL;
2493                 fallthrough;
2494         default:
2495                 return 0;
2496         }
2497 }
2498
2499 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2500 {
2501         switch (prog_type) {
2502         case BPF_PROG_TYPE_SCHED_CLS:
2503         case BPF_PROG_TYPE_SCHED_ACT:
2504         case BPF_PROG_TYPE_XDP:
2505         case BPF_PROG_TYPE_LWT_IN:
2506         case BPF_PROG_TYPE_LWT_OUT:
2507         case BPF_PROG_TYPE_LWT_XMIT:
2508         case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2509         case BPF_PROG_TYPE_SK_SKB:
2510         case BPF_PROG_TYPE_SK_MSG:
2511         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2512         case BPF_PROG_TYPE_CGROUP_DEVICE:
2513         case BPF_PROG_TYPE_CGROUP_SOCK:
2514         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2515         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2516         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2517         case BPF_PROG_TYPE_SOCK_OPS:
2518         case BPF_PROG_TYPE_EXT: /* extends any prog */
2519         case BPF_PROG_TYPE_NETFILTER:
2520                 return true;
2521         case BPF_PROG_TYPE_CGROUP_SKB:
2522                 /* always unpriv */
2523         case BPF_PROG_TYPE_SK_REUSEPORT:
2524                 /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2525         default:
2526                 return false;
2527         }
2528 }
2529
2530 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2531 {
2532         switch (prog_type) {
2533         case BPF_PROG_TYPE_KPROBE:
2534         case BPF_PROG_TYPE_TRACEPOINT:
2535         case BPF_PROG_TYPE_PERF_EVENT:
2536         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2537         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2538         case BPF_PROG_TYPE_TRACING:
2539         case BPF_PROG_TYPE_LSM:
2540         case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2541         case BPF_PROG_TYPE_EXT: /* extends any prog */
2542                 return true;
2543         default:
2544                 return false;
2545         }
2546 }
2547
2548 /* last field in 'union bpf_attr' used by this command */
2549 #define BPF_PROG_LOAD_LAST_FIELD log_true_size
2550
/* Handle program loading: validate @attr, allocate a bpf_prog, copy the
 * instructions and license in, run the verifier, select the runtime, allocate
 * an ID and finally create an FD for the program.
 *
 * @attr:       program load attributes
 * @uattr:      original attribute pointer; uattr.is_kernel selects whether
 *              attr->insns and attr->license are treated as kernel pointers
 * @uattr_size: passed through to bpf_check()
 *
 * Returns the result of bpf_prog_new_fd() on success, or a negative errno.
 */
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
        enum bpf_prog_type type = attr->prog_type;
        struct bpf_prog *prog, *dst_prog = NULL;
        struct btf *attach_btf = NULL;
        int err;
        char license[128];

        if (CHECK_ATTR(BPF_PROG_LOAD))
                return -EINVAL;

        /* reject any flag bit outside the supported set */
        if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
                                 BPF_F_ANY_ALIGNMENT |
                                 BPF_F_TEST_STATE_FREQ |
                                 BPF_F_SLEEPABLE |
                                 BPF_F_TEST_RND_HI32 |
                                 BPF_F_XDP_HAS_FRAGS |
                                 BPF_F_XDP_DEV_BOUND_ONLY))
                return -EINVAL;

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
            !bpf_capable())
                return -EPERM;

        /* Intent here is for unprivileged_bpf_disabled to block BPF program
         * creation for unprivileged users; other actions depend
         * on fd availability and access to bpffs, so are dependent on
         * object creation success. Even with unprivileged BPF disabled,
         * capability checks are still carried out for these
         * and other operations.
         */
        if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
                return -EPERM;

        /* the instruction count limit depends on bpf_capable() */
        if (attr->insn_cnt == 0 ||
            attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
                return -E2BIG;
        /* only socket filter and cgroup skb programs pass without bpf_capable() */
        if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
            type != BPF_PROG_TYPE_CGROUP_SKB &&
            !bpf_capable())
                return -EPERM;

        if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (is_perfmon_prog_type(type) && !perfmon_capable())
                return -EPERM;

        /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
         * or btf, we need to check which one it is
         */
        if (attr->attach_prog_fd) {
                dst_prog = bpf_prog_get(attr->attach_prog_fd);
                if (IS_ERR(dst_prog)) {
                        dst_prog = NULL;
                        attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
                        if (IS_ERR(attach_btf))
                                return -EINVAL;
                        if (!btf_is_kernel(attach_btf)) {
                                /* attaching through specifying bpf_prog's BTF
                                 * objects directly might be supported eventually
                                 */
                                btf_put(attach_btf);
                                return -ENOTSUPP;
                        }
                }
        } else if (attr->attach_btf_id) {
                /* fall back to vmlinux BTF, if BTF type ID is specified */
                attach_btf = bpf_get_btf_vmlinux();
                if (IS_ERR(attach_btf))
                        return PTR_ERR(attach_btf);
                if (!attach_btf)
                        return -EINVAL;
                btf_get(attach_btf);
        }

        bpf_prog_load_fixup_attach_type(attr);
        if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
                                       attach_btf, attr->attach_btf_id,
                                       dst_prog)) {
                /* prog is not allocated yet, so drop the references by hand */
                if (dst_prog)
                        bpf_prog_put(dst_prog);
                if (attach_btf)
                        btf_put(attach_btf);
                return -EINVAL;
        }

        /* plain bpf_prog allocation */
        prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
        if (!prog) {
                if (dst_prog)
                        bpf_prog_put(dst_prog);
                if (attach_btf)
                        btf_put(attach_btf);
                return -ENOMEM;
        }

        /* from here on attach_btf and dst_prog are tracked via prog->aux */
        prog->expected_attach_type = attr->expected_attach_type;
        prog->aux->attach_btf = attach_btf;
        prog->aux->attach_btf_id = attr->attach_btf_id;
        prog->aux->dst_prog = dst_prog;
        prog->aux->dev_bound = !!attr->prog_ifindex;
        prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
        prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;

        err = security_bpf_prog_alloc(prog->aux);
        if (err)
                goto free_prog;

        prog->aux->user = get_current_user();
        prog->len = attr->insn_cnt;

        /* both copies below fail with -EFAULT */
        err = -EFAULT;
        if (copy_from_bpfptr(prog->insns,
                             make_bpfptr(attr->insns, uattr.is_kernel),
                             bpf_prog_insn_size(prog)) != 0)
                goto free_prog_sec;
        /* copy eBPF program license from user space */
        if (strncpy_from_bpfptr(license,
                                make_bpfptr(attr->license, uattr.is_kernel),
                                sizeof(license) - 1) < 0)
                goto free_prog_sec;
        /* force termination in case the copy filled the buffer */
        license[sizeof(license) - 1] = 0;

        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;

        prog->orig_prog = NULL;
        prog->jited = 0;

        atomic64_set(&prog->aux->refcnt, 1);

        if (bpf_prog_is_dev_bound(prog->aux)) {
                err = bpf_prog_dev_bound_init(prog, attr);
                if (err)
                        goto free_prog_sec;
        }

        if (type == BPF_PROG_TYPE_EXT && dst_prog &&
            bpf_prog_is_dev_bound(dst_prog->aux)) {
                err = bpf_prog_dev_bound_inherit(prog, dst_prog);
                if (err)
                        goto free_prog_sec;
        }

        /* find program type: socket_filter vs tracing_filter */
        err = find_prog_type(type, prog);
        if (err < 0)
                goto free_prog_sec;

        prog->aux->load_time = ktime_get_boottime_ns();
        err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
                               sizeof(attr->prog_name));
        if (err < 0)
                goto free_prog_sec;

        /* run eBPF verifier */
        err = bpf_check(&prog, attr, uattr, uattr_size);
        if (err < 0)
                goto free_used_maps;

        prog = bpf_prog_select_runtime(prog, &err);
        if (err < 0)
                goto free_used_maps;

        err = bpf_prog_alloc_id(prog);
        if (err)
                goto free_used_maps;

        /* Upon success of bpf_prog_alloc_id(), the BPF prog is
         * effectively publicly exposed. However, retrieving via
         * bpf_prog_get_fd_by_id() will take another reference,
         * therefore it cannot be gone underneath us.
         *
         * Only for the time /after/ successful bpf_prog_new_fd()
         * and before returning to userspace, we might just hold
         * one reference and any parallel close on that fd could
         * rip everything out. Hence, below notifications must
         * happen before bpf_prog_new_fd().
         *
         * Also, any failure handling from this point onwards must
         * be using bpf_prog_put() given the program is exposed.
         */
        bpf_prog_kallsyms_add(prog);
        perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
        bpf_audit_prog(prog, BPF_AUDIT_LOAD);

        err = bpf_prog_new_fd(prog);
        if (err < 0)
                bpf_prog_put(prog);
        return err;

free_used_maps:
        /* In case we have subprogs, we need to wait for a grace
         * period before we can tear down JIT memory since symbols
         * are already exposed under kallsyms.
         */
        __bpf_prog_put_noref(prog, prog->aux->func_cnt);
        return err;
free_prog_sec:
        /* undo get_current_user() and security_bpf_prog_alloc() */
        free_uid(prog->aux->user);
        security_bpf_prog_free(prog->aux);
free_prog:
        if (prog->aux->attach_btf)
                btf_put(prog->aux->attach_btf);
        bpf_prog_free(prog);
        return err;
}
2759
2760 #define BPF_OBJ_LAST_FIELD path_fd
2761
2762 static int bpf_obj_pin(const union bpf_attr *attr)
2763 {
2764         int path_fd;
2765
2766         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
2767                 return -EINVAL;
2768
2769         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2770         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2771                 return -EINVAL;
2772
2773         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2774         return bpf_obj_pin_user(attr->bpf_fd, path_fd,
2775                                 u64_to_user_ptr(attr->pathname));
2776 }
2777
2778 static int bpf_obj_get(const union bpf_attr *attr)
2779 {
2780         int path_fd;
2781
2782         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2783             attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
2784                 return -EINVAL;
2785
2786         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2787         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2788                 return -EINVAL;
2789
2790         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2791         return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2792                                 attr->file_flags);
2793 }
2794
2795 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2796                    const struct bpf_link_ops *ops, struct bpf_prog *prog)
2797 {
2798         atomic64_set(&link->refcnt, 1);
2799         link->type = type;
2800         link->id = 0;
2801         link->ops = ops;
2802         link->prog = prog;
2803 }
2804
2805 static void bpf_link_free_id(int id)
2806 {
2807         if (!id)
2808                 return;
2809
2810         spin_lock_bh(&link_idr_lock);
2811         idr_remove(&link_idr, id);
2812         spin_unlock_bh(&link_idr_lock);
2813 }
2814
/* Clean up bpf_link and corresponding anon_inode file and FD. After
 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
 * anon_inode's release() call. This helper marks bpf_link as
 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
 * is not decremented, it's the responsibility of a calling code that failed
 * to complete bpf_link initialization.
 * This helper eventually calls link's dealloc callback, but does not call
 * link's release callback.
 *
 * @primer: state filled in by bpf_link_prime() for the link being torn down
 */
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
        /* a NULL prog is what marks the link as defunct */
        primer->link->prog = NULL;
        bpf_link_free_id(primer->id);
        fput(primer->file);
        put_unused_fd(primer->fd);
}
2831
/* Take an additional reference on @link. */
void bpf_link_inc(struct bpf_link *link)
{
        atomic64_inc(&link->refcnt);
}
2836
/* bpf_link_free is guaranteed to be called from process context.
 *
 * Removes the link's ID from the IDR, then, if the link still has a program,
 * calls the link's release callback and drops the program reference, and
 * finally calls the link's dealloc callback. A link whose prog is NULL skips
 * the release callback and the program put.
 */
static void bpf_link_free(struct bpf_link *link)
{
        bpf_link_free_id(link->id);
        if (link->prog) {
                /* detach BPF program, clean up used resources */
                link->ops->release(link);
                bpf_prog_put(link->prog);
        }
        /* free bpf_link and its containing memory */
        link->ops->dealloc(link);
}
2849
/* Work callback scheduled by bpf_link_put(): free the link embedding @work. */
static void bpf_link_put_deferred(struct work_struct *work)
{
        struct bpf_link *link = container_of(work, struct bpf_link, work);

        bpf_link_free(link);
}
2856
2857 /* bpf_link_put might be called from atomic context. It needs to be called
2858  * from sleepable context in order to acquire sleeping locks during the process.
2859  */
2860 void bpf_link_put(struct bpf_link *link)
2861 {
2862         if (!atomic64_dec_and_test(&link->refcnt))
2863                 return;
2864
2865         INIT_WORK(&link->work, bpf_link_put_deferred);
2866         schedule_work(&link->work);
2867 }
2868 EXPORT_SYMBOL(bpf_link_put);
2869
2870 static void bpf_link_put_direct(struct bpf_link *link)
2871 {
2872         if (!atomic64_dec_and_test(&link->refcnt))
2873                 return;
2874         bpf_link_free(link);
2875 }
2876
/* ->release() handler of bpf_link_fops: drop the reference held through
 * filp->private_data, freeing the link directly if it was the last one.
 * Always returns 0.
 */
static int bpf_link_release(struct inode *inode, struct file *filp)
{
        struct bpf_link *link = filp->private_data;

        bpf_link_put_direct(link);
        return 0;
}
2884
2885 #ifdef CONFIG_PROC_FS
/* Build a table of link type names indexed by link type ID. Only the
 * BPF_LINK_TYPE() entries of <linux/bpf_types.h> expand to something here;
 * BPF_PROG_TYPE() and BPF_MAP_TYPE() are defined empty for the include.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
static const char *bpf_link_type_strs[] = {
        [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
2896
2897 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2898 {
2899         const struct bpf_link *link = filp->private_data;
2900         const struct bpf_prog *prog = link->prog;
2901         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2902
2903         seq_printf(m,
2904                    "link_type:\t%s\n"
2905                    "link_id:\t%u\n",
2906                    bpf_link_type_strs[link->type],
2907                    link->id);
2908         if (prog) {
2909                 bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2910                 seq_printf(m,
2911                            "prog_tag:\t%s\n"
2912                            "prog_id:\t%u\n",
2913                            prog_tag,
2914                            prog->aux->id);
2915         }
2916         if (link->ops->show_fdinfo)
2917                 link->ops->show_fdinfo(link, m);
2918 }
2919 #endif
2920
/* File operations of the anon_inode file backing a bpf_link. Comparing a
 * file's f_op against this table is how bpf_link_get_from_fd() recognizes
 * link FDs.
 */
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_link_show_fdinfo,
#endif
        .release        = bpf_link_release,
        .read           = bpf_dummy_read,
        .write          = bpf_dummy_write,
};
2929
/* Allocate an ID for @link in link_idr, in the range [1, INT_MAX).
 *
 * The IDR is preloaded with GFP_KERNEL outside the lock; the allocation
 * itself runs under link_idr_lock with GFP_ATOMIC.
 *
 * Returns the value of idr_alloc_cyclic(): the new ID, or a negative errno.
 * Note that link->id itself is not written here.
 */
static int bpf_link_alloc_id(struct bpf_link *link)
{
        int id;

        idr_preload(GFP_KERNEL);
        spin_lock_bh(&link_idr_lock);
        id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
        spin_unlock_bh(&link_idr_lock);
        idr_preload_end();

        return id;
}
2942
2943 /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
2944  * reserving unused FD and allocating ID from link_idr. This is to be paired
2945  * with bpf_link_settle() to install FD and ID and expose bpf_link to
2946  * user-space, if bpf_link is successfully attached. If not, bpf_link and
2947  * pre-allocated resources are to be freed with bpf_cleanup() call. All the
2948  * transient state is passed around in struct bpf_link_primer.
2949  * This is preferred way to create and initialize bpf_link, especially when
2950  * there are complicated and expensive operations in between creating bpf_link
2951  * itself and attaching it to BPF hook. By using bpf_link_prime() and
2952  * bpf_link_settle() kernel code using bpf_link doesn't have to perform
2953  * expensive (and potentially failing) roll back operations in a rare case
2954  * that file, FD, or ID can't be allocated.
2955  */
2956 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
2957 {
2958         struct file *file;
2959         int fd, id;
2960
2961         fd = get_unused_fd_flags(O_CLOEXEC);
2962         if (fd < 0)
2963                 return fd;
2964
2965
2966         id = bpf_link_alloc_id(link);
2967         if (id < 0) {
2968                 put_unused_fd(fd);
2969                 return id;
2970         }
2971
2972         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2973         if (IS_ERR(file)) {
2974                 bpf_link_free_id(id);
2975                 put_unused_fd(fd);
2976                 return PTR_ERR(file);
2977         }
2978
2979         primer->link = link;
2980         primer->file = file;
2981         primer->fd = fd;
2982         primer->id = id;
2983         return 0;
2984 }
2985
/* Publish a primed link: store the reserved ID in link->id, install the
 * primed file into the reserved FD and return that FD.
 *
 * @primer: state filled in by a successful bpf_link_prime()
 */
int bpf_link_settle(struct bpf_link_primer *primer)
{
        /* make bpf_link fetchable by ID */
        spin_lock_bh(&link_idr_lock);
        primer->link->id = primer->id;
        spin_unlock_bh(&link_idr_lock);
        /* make bpf_link fetchable by FD */
        fd_install(primer->fd, primer->file);
        /* pass through installed FD */
        return primer->fd;
}
2997
/* Create an O_CLOEXEC anon_inode FD named "bpf-link" for @link, backed by
 * bpf_link_fops. Returns the result of anon_inode_getfd().
 */
int bpf_link_new_fd(struct bpf_link *link)
{
        return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
3002
3003 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3004 {
3005         struct fd f = fdget(ufd);
3006         struct bpf_link *link;
3007
3008         if (!f.file)
3009                 return ERR_PTR(-EBADF);
3010         if (f.file->f_op != &bpf_link_fops) {
3011                 fdput(f);
3012                 return ERR_PTR(-EINVAL);
3013         }
3014
3015         link = f.file->private_data;
3016         bpf_link_inc(link);
3017         fdput(f);
3018
3019         return link;
3020 }
3021 EXPORT_SYMBOL(bpf_link_get_from_fd);
3022
/* ->release() callback of tracing links: unlink the program from its
 * trampoline (warning once if that fails), then drop the trampoline
 * reference and, if present, the target program reference.
 */
static void bpf_tracing_link_release(struct bpf_link *link)
{
        struct bpf_tracing_link *tr_link =
                container_of(link, struct bpf_tracing_link, link.link);

        WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
                                                tr_link->trampoline));

        bpf_trampoline_put(tr_link->trampoline);

        /* tgt_prog is NULL if target is a kernel function */
        if (tr_link->tgt_prog)
                bpf_prog_put(tr_link->tgt_prog);
}
3037
/* ->dealloc() callback of tracing links: free the containing
 * struct bpf_tracing_link.
 */
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
        struct bpf_tracing_link *tr_link =
                container_of(link, struct bpf_tracing_link, link.link);

        kfree(tr_link);
}
3045
/* ->show_fdinfo() callback of tracing links: print the attach type together
 * with the target object ID and target BTF ID unpacked from the trampoline
 * key.
 */
static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
                                         struct seq_file *seq)
{
        struct bpf_tracing_link *tr_link =
                container_of(link, struct bpf_tracing_link, link.link);
        u32 target_btf_id, target_obj_id;

        bpf_trampoline_unpack_key(tr_link->trampoline->key,
                                  &target_obj_id, &target_btf_id);
        seq_printf(seq,
                   "attach_type:\t%d\n"
                   "target_obj_id:\t%u\n"
                   "target_btf_id:\t%u\n",
                   tr_link->attach_type,
                   target_obj_id,
                   target_btf_id);
}
3063
/* ->fill_link_info() callback of tracing links: report the attach type and
 * the target object/BTF IDs unpacked from the trampoline key in
 * info->tracing. Always returns 0.
 */
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
                                           struct bpf_link_info *info)
{
        struct bpf_tracing_link *tr_link =
                container_of(link, struct bpf_tracing_link, link.link);

        info->tracing.attach_type = tr_link->attach_type;
        bpf_trampoline_unpack_key(tr_link->trampoline->key,
                                  &info->tracing.target_obj_id,
                                  &info->tracing.target_btf_id);

        return 0;
}
3077
/* Link operations of tracing links created by bpf_tracing_prog_attach(). */
static const struct bpf_link_ops bpf_tracing_link_lops = {
        .release = bpf_tracing_link_release,
        .dealloc = bpf_tracing_link_dealloc,
        .show_fdinfo = bpf_tracing_link_show_fdinfo,
        .fill_link_info = bpf_tracing_link_fill_link_info,
};
3084
3085 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3086                                    int tgt_prog_fd,
3087                                    u32 btf_id,
3088                                    u64 bpf_cookie)
3089 {
3090         struct bpf_link_primer link_primer;
3091         struct bpf_prog *tgt_prog = NULL;
3092         struct bpf_trampoline *tr = NULL;
3093         struct bpf_tracing_link *link;
3094         u64 key = 0;
3095         int err;
3096
3097         switch (prog->type) {
3098         case BPF_PROG_TYPE_TRACING:
3099                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3100                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
3101                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
3102                         err = -EINVAL;
3103                         goto out_put_prog;
3104                 }
3105                 break;
3106         case BPF_PROG_TYPE_EXT:
3107                 if (prog->expected_attach_type != 0) {
3108                         err = -EINVAL;
3109                         goto out_put_prog;
3110                 }
3111                 break;
3112         case BPF_PROG_TYPE_LSM:
3113                 if (prog->expected_attach_type != BPF_LSM_MAC) {
3114                         err = -EINVAL;
3115                         goto out_put_prog;
3116                 }
3117                 break;
3118         default:
3119                 err = -EINVAL;
3120                 goto out_put_prog;
3121         }
3122
3123         if (!!tgt_prog_fd != !!btf_id) {
3124                 err = -EINVAL;
3125                 goto out_put_prog;
3126         }
3127
3128         if (tgt_prog_fd) {
3129                 /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
3130                 if (prog->type != BPF_PROG_TYPE_EXT) {
3131                         err = -EINVAL;
3132                         goto out_put_prog;
3133                 }
3134
3135                 tgt_prog = bpf_prog_get(tgt_prog_fd);
3136                 if (IS_ERR(tgt_prog)) {
3137                         err = PTR_ERR(tgt_prog);
3138                         tgt_prog = NULL;
3139                         goto out_put_prog;
3140                 }
3141
3142                 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3143         }
3144
3145         link = kzalloc(sizeof(*link), GFP_USER);
3146         if (!link) {
3147                 err = -ENOMEM;
3148                 goto out_put_prog;
3149         }
3150         bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
3151                       &bpf_tracing_link_lops, prog);
3152         link->attach_type = prog->expected_attach_type;
3153         link->link.cookie = bpf_cookie;
3154
3155         mutex_lock(&prog->aux->dst_mutex);
3156
3157         /* There are a few possible cases here:
3158          *
3159          * - if prog->aux->dst_trampoline is set, the program was just loaded
3160          *   and not yet attached to anything, so we can use the values stored
3161          *   in prog->aux
3162          *
3163          * - if prog->aux->dst_trampoline is NULL, the program has already been
3164          *   attached to a target and its initial target was cleared (below)
3165          *
3166          * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3167          *   target_btf_id using the link_create API.
3168          *
3169          * - if tgt_prog == NULL when this function was called using the old
3170          *   raw_tracepoint_open API, and we need a target from prog->aux
3171          *
3172          * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
3173          *   was detached and is going for re-attachment.
3174          */
3175         if (!prog->aux->dst_trampoline && !tgt_prog) {
3176                 /*
3177                  * Allow re-attach for TRACING and LSM programs. If it's
3178                  * currently linked, bpf_trampoline_link_prog will fail.
3179                  * EXT programs need to specify tgt_prog_fd, so they
3180                  * re-attach in separate code path.
3181                  */
3182                 if (prog->type != BPF_PROG_TYPE_TRACING &&
3183                     prog->type != BPF_PROG_TYPE_LSM) {
3184                         err = -EINVAL;
3185                         goto out_unlock;
3186                 }
3187                 btf_id = prog->aux->attach_btf_id;
3188                 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
3189         }
3190
3191         if (!prog->aux->dst_trampoline ||
3192             (key && key != prog->aux->dst_trampoline->key)) {
3193                 /* If there is no saved target, or the specified target is
3194                  * different from the destination specified at load time, we
3195                  * need a new trampoline and a check for compatibility
3196                  */
3197                 struct bpf_attach_target_info tgt_info = {};
3198
3199                 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3200                                               &tgt_info);
3201                 if (err)
3202                         goto out_unlock;
3203
3204                 if (tgt_info.tgt_mod) {
3205                         module_put(prog->aux->mod);
3206                         prog->aux->mod = tgt_info.tgt_mod;
3207                 }
3208
3209                 tr = bpf_trampoline_get(key, &tgt_info);
3210                 if (!tr) {
3211                         err = -ENOMEM;
3212                         goto out_unlock;
3213                 }
3214         } else {
3215                 /* The caller didn't specify a target, or the target was the
3216                  * same as the destination supplied during program load. This
3217                  * means we can reuse the trampoline and reference from program
3218                  * load time, and there is no need to allocate a new one. This
3219                  * can only happen once for any program, as the saved values in
3220                  * prog->aux are cleared below.
3221                  */
3222                 tr = prog->aux->dst_trampoline;
3223                 tgt_prog = prog->aux->dst_prog;
3224         }
3225
3226         err = bpf_link_prime(&link->link.link, &link_primer);
3227         if (err)
3228                 goto out_unlock;
3229
3230         err = bpf_trampoline_link_prog(&link->link, tr);
3231         if (err) {
3232                 bpf_link_cleanup(&link_primer);
3233                 link = NULL;
3234                 goto out_unlock;
3235         }
3236
3237         link->tgt_prog = tgt_prog;
3238         link->trampoline = tr;
3239
3240         /* Always clear the trampoline and target prog from prog->aux to make
3241          * sure the original attach destination is not kept alive after a
3242          * program is (re-)attached to another target.
3243          */
3244         if (prog->aux->dst_prog &&
3245             (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3246                 /* got extra prog ref from syscall, or attaching to different prog */
3247                 bpf_prog_put(prog->aux->dst_prog);
3248         if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3249                 /* we allocated a new trampoline, so free the old one */
3250                 bpf_trampoline_put(prog->aux->dst_trampoline);
3251
3252         prog->aux->dst_prog = NULL;
3253         prog->aux->dst_trampoline = NULL;
3254         mutex_unlock(&prog->aux->dst_mutex);
3255
3256         return bpf_link_settle(&link_primer);
3257 out_unlock:
3258         if (tr && tr != prog->aux->dst_trampoline)
3259                 bpf_trampoline_put(tr);
3260         mutex_unlock(&prog->aux->dst_mutex);
3261         kfree(link);
3262 out_put_prog:
3263         if (tgt_prog_fd && tgt_prog)
3264                 bpf_prog_put(tgt_prog);
3265         return err;
3266 }
3267
/* A bpf_link attached to a raw tracepoint. */
struct bpf_raw_tp_link {
        struct bpf_link link;
        /* raw tracepoint the link's program is registered on */
        struct bpf_raw_event_map *btp;
};
3272
/* ->release() callback of raw tracepoint links: unregister the program from
 * the raw tracepoint and drop the raw tracepoint reference.
 */
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
        struct bpf_raw_tp_link *raw_tp =
                container_of(link, struct bpf_raw_tp_link, link);

        bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
        bpf_put_raw_tracepoint(raw_tp->btp);
}
3281
/* ->dealloc() callback of raw tracepoint links: free the containing
 * struct bpf_raw_tp_link.
 */
static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
{
        struct bpf_raw_tp_link *raw_tp =
                container_of(link, struct bpf_raw_tp_link, link);

        kfree(raw_tp);
}
3289
/* ->show_fdinfo() callback of raw tracepoint links: print the name of the
 * tracepoint the link is attached to.
 */
static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
                                        struct seq_file *seq)
{
        struct bpf_raw_tp_link *raw_tp_link =
                container_of(link, struct bpf_raw_tp_link, link);

        seq_printf(seq,
                   "tp_name:\t%s\n",
                   raw_tp_link->btp->tp->name);
}
3300
3301 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
3302                             u32 len)
3303 {
3304         if (ulen >= len + 1) {
3305                 if (copy_to_user(ubuf, buf, len + 1))
3306                         return -EFAULT;
3307         } else {
3308                 char zero = '\0';
3309
3310                 if (copy_to_user(ubuf, buf, ulen - 1))
3311                         return -EFAULT;
3312                 if (put_user(zero, ubuf + ulen - 1))
3313                         return -EFAULT;
3314                 return -ENOSPC;
3315         }
3316
3317         return 0;
3318 }
3319
3320 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3321                                           struct bpf_link_info *info)
3322 {
3323         struct bpf_raw_tp_link *raw_tp_link =
3324                 container_of(link, struct bpf_raw_tp_link, link);
3325         char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3326         const char *tp_name = raw_tp_link->btp->tp->name;
3327         u32 ulen = info->raw_tracepoint.tp_name_len;
3328         size_t tp_len = strlen(tp_name);
3329
3330         if (!ulen ^ !ubuf)
3331                 return -EINVAL;
3332
3333         info->raw_tracepoint.tp_name_len = tp_len + 1;
3334
3335         if (!ubuf)
3336                 return 0;
3337
3338         return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
3339 }
3340
/* Link operations of raw tracepoint links. */
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
        .release = bpf_raw_tp_link_release,
        .dealloc = bpf_raw_tp_link_dealloc,
        .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
        .fill_link_info = bpf_raw_tp_link_fill_link_info,
};
3347
3348 #ifdef CONFIG_PERF_EVENTS
/* A bpf_link attached to a perf event. */
struct bpf_perf_link {
        struct bpf_link link;
        /* perf event file; its private_data is the struct perf_event */
        struct file *perf_file;
};
3353
/* ->release() callback of perf links: detach the BPF program from the perf
 * event and drop the link's reference on the perf event file.
 */
static void bpf_perf_link_release(struct bpf_link *link)
{
        struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
        struct perf_event *event = perf_link->perf_file->private_data;

        perf_event_free_bpf_prog(event);
        fput(perf_link->perf_file);
}
3362
/* ->dealloc() callback of perf links: free the containing
 * struct bpf_perf_link.
 */
static void bpf_perf_link_dealloc(struct bpf_link *link)
{
        struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);

        kfree(perf_link);
}
3369
3370 static int bpf_perf_link_fill_common(const struct perf_event *event,
3371                                      char __user *uname, u32 ulen,
3372                                      u64 *probe_offset, u64 *probe_addr,
3373                                      u32 *fd_type)
3374 {
3375         const char *buf;
3376         u32 prog_id;
3377         size_t len;
3378         int err;
3379
3380         if (!ulen ^ !uname)
3381                 return -EINVAL;
3382
3383         err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
3384                                       probe_offset, probe_addr);
3385         if (err)
3386                 return err;
3387         if (!uname)
3388                 return 0;
3389         if (buf) {
3390                 len = strlen(buf);
3391                 err = bpf_copy_to_user(uname, buf, ulen, len);
3392                 if (err)
3393                         return err;
3394         } else {
3395                 char zero = '\0';
3396
3397                 if (put_user(zero, uname))
3398                         return -EFAULT;
3399         }
3400         return 0;
3401 }
3402
3403 #ifdef CONFIG_KPROBE_EVENTS
3404 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
3405                                      struct bpf_link_info *info)
3406 {
3407         char __user *uname;
3408         u64 addr, offset;
3409         u32 ulen, type;
3410         int err;
3411
3412         uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
3413         ulen = info->perf_event.kprobe.name_len;
3414         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3415                                         &type);
3416         if (err)
3417                 return err;
3418         if (type == BPF_FD_TYPE_KRETPROBE)
3419                 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
3420         else
3421                 info->perf_event.type = BPF_PERF_EVENT_KPROBE;
3422
3423         info->perf_event.kprobe.offset = offset;
3424         if (!kallsyms_show_value(current_cred()))
3425                 addr = 0;
3426         info->perf_event.kprobe.addr = addr;
3427         return 0;
3428 }
3429 #endif
3430
3431 #ifdef CONFIG_UPROBE_EVENTS
3432 static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
3433                                      struct bpf_link_info *info)
3434 {
3435         char __user *uname;
3436         u64 addr, offset;
3437         u32 ulen, type;
3438         int err;
3439
3440         uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
3441         ulen = info->perf_event.uprobe.name_len;
3442         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3443                                         &type);
3444         if (err)
3445                 return err;
3446
3447         if (type == BPF_FD_TYPE_URETPROBE)
3448                 info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
3449         else
3450                 info->perf_event.type = BPF_PERF_EVENT_UPROBE;
3451         info->perf_event.uprobe.offset = offset;
3452         return 0;
3453 }
3454 #endif
3455
3456 static int bpf_perf_link_fill_probe(const struct perf_event *event,
3457                                     struct bpf_link_info *info)
3458 {
3459 #ifdef CONFIG_KPROBE_EVENTS
3460         if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3461                 return bpf_perf_link_fill_kprobe(event, info);
3462 #endif
3463 #ifdef CONFIG_UPROBE_EVENTS
3464         if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3465                 return bpf_perf_link_fill_uprobe(event, info);
3466 #endif
3467         return -EOPNOTSUPP;
3468 }
3469
3470 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3471                                          struct bpf_link_info *info)
3472 {
3473         char __user *uname;
3474         u32 ulen;
3475
3476         uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3477         ulen = info->perf_event.tracepoint.name_len;
3478         info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3479         return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
3480 }
3481
3482 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3483                                          struct bpf_link_info *info)
3484 {
3485         info->perf_event.event.type = event->attr.type;
3486         info->perf_event.event.config = event->attr.config;
3487         info->perf_event.type = BPF_PERF_EVENT_EVENT;
3488         return 0;
3489 }
3490
3491 static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3492                                         struct bpf_link_info *info)
3493 {
3494         struct bpf_perf_link *perf_link;
3495         const struct perf_event *event;
3496
3497         perf_link = container_of(link, struct bpf_perf_link, link);
3498         event = perf_get_event(perf_link->perf_file);
3499         if (IS_ERR(event))
3500                 return PTR_ERR(event);
3501
3502         switch (event->prog->type) {
3503         case BPF_PROG_TYPE_PERF_EVENT:
3504                 return bpf_perf_link_fill_perf_event(event, info);
3505         case BPF_PROG_TYPE_TRACEPOINT:
3506                 return bpf_perf_link_fill_tracepoint(event, info);
3507         case BPF_PROG_TYPE_KPROBE:
3508                 return bpf_perf_link_fill_probe(event, info);
3509         default:
3510                 return -EOPNOTSUPP;
3511         }
3512 }
3513
3514 static const struct bpf_link_ops bpf_perf_link_lops = {
3515         .release = bpf_perf_link_release,
3516         .dealloc = bpf_perf_link_dealloc,
3517         .fill_link_info = bpf_perf_link_fill_link_info,
3518 };
3519
3520 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3521 {
3522         struct bpf_link_primer link_primer;
3523         struct bpf_perf_link *link;
3524         struct perf_event *event;
3525         struct file *perf_file;
3526         int err;
3527
3528         if (attr->link_create.flags)
3529                 return -EINVAL;
3530
3531         perf_file = perf_event_get(attr->link_create.target_fd);
3532         if (IS_ERR(perf_file))
3533                 return PTR_ERR(perf_file);
3534
3535         link = kzalloc(sizeof(*link), GFP_USER);
3536         if (!link) {
3537                 err = -ENOMEM;
3538                 goto out_put_file;
3539         }
3540         bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
3541         link->perf_file = perf_file;
3542
3543         err = bpf_link_prime(&link->link, &link_primer);
3544         if (err) {
3545                 kfree(link);
3546                 goto out_put_file;
3547         }
3548
3549         event = perf_file->private_data;
3550         err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
3551         if (err) {
3552                 bpf_link_cleanup(&link_primer);
3553                 goto out_put_file;
3554         }
3555         /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3556         bpf_prog_inc(prog);
3557
3558         return bpf_link_settle(&link_primer);
3559
3560 out_put_file:
3561         fput(perf_file);
3562         return err;
3563 }
3564 #else
3565 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3566 {
3567         return -EOPNOTSUPP;
3568 }
3569 #endif /* CONFIG_PERF_EVENTS */
3570
3571 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3572                                   const char __user *user_tp_name)
3573 {
3574         struct bpf_link_primer link_primer;
3575         struct bpf_raw_tp_link *link;
3576         struct bpf_raw_event_map *btp;
3577         const char *tp_name;
3578         char buf[128];
3579         int err;
3580
3581         switch (prog->type) {
3582         case BPF_PROG_TYPE_TRACING:
3583         case BPF_PROG_TYPE_EXT:
3584         case BPF_PROG_TYPE_LSM:
3585                 if (user_tp_name)
3586                         /* The attach point for this category of programs
3587                          * should be specified via btf_id during program load.
3588                          */
3589                         return -EINVAL;
3590                 if (prog->type == BPF_PROG_TYPE_TRACING &&
3591                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3592                         tp_name = prog->aux->attach_func_name;
3593                         break;
3594                 }
3595                 return bpf_tracing_prog_attach(prog, 0, 0, 0);
3596         case BPF_PROG_TYPE_RAW_TRACEPOINT:
3597         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3598                 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
3599                         return -EFAULT;
3600                 buf[sizeof(buf) - 1] = 0;
3601                 tp_name = buf;
3602                 break;
3603         default:
3604                 return -EINVAL;
3605         }
3606
3607         btp = bpf_get_raw_tracepoint(tp_name);
3608         if (!btp)
3609                 return -ENOENT;
3610
3611         link = kzalloc(sizeof(*link), GFP_USER);
3612         if (!link) {
3613                 err = -ENOMEM;
3614                 goto out_put_btp;
3615         }
3616         bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
3617                       &bpf_raw_tp_link_lops, prog);
3618         link->btp = btp;
3619
3620         err = bpf_link_prime(&link->link, &link_primer);
3621         if (err) {
3622                 kfree(link);
3623                 goto out_put_btp;
3624         }
3625
3626         err = bpf_probe_register(link->btp, prog);
3627         if (err) {
3628                 bpf_link_cleanup(&link_primer);
3629                 goto out_put_btp;
3630         }
3631
3632         return bpf_link_settle(&link_primer);
3633
3634 out_put_btp:
3635         bpf_put_raw_tracepoint(btp);
3636         return err;
3637 }
3638
3639 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3640
3641 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3642 {
3643         struct bpf_prog *prog;
3644         int fd;
3645
3646         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3647                 return -EINVAL;
3648
3649         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
3650         if (IS_ERR(prog))
3651                 return PTR_ERR(prog);
3652
3653         fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3654         if (fd < 0)
3655                 bpf_prog_put(prog);
3656         return fd;
3657 }
3658
3659 static enum bpf_prog_type
3660 attach_type_to_prog_type(enum bpf_attach_type attach_type)
3661 {
3662         switch (attach_type) {
3663         case BPF_CGROUP_INET_INGRESS:
3664         case BPF_CGROUP_INET_EGRESS:
3665                 return BPF_PROG_TYPE_CGROUP_SKB;
3666         case BPF_CGROUP_INET_SOCK_CREATE:
3667         case BPF_CGROUP_INET_SOCK_RELEASE:
3668         case BPF_CGROUP_INET4_POST_BIND:
3669         case BPF_CGROUP_INET6_POST_BIND:
3670                 return BPF_PROG_TYPE_CGROUP_SOCK;
3671         case BPF_CGROUP_INET4_BIND:
3672         case BPF_CGROUP_INET6_BIND:
3673         case BPF_CGROUP_INET4_CONNECT:
3674         case BPF_CGROUP_INET6_CONNECT:
3675         case BPF_CGROUP_INET4_GETPEERNAME:
3676         case BPF_CGROUP_INET6_GETPEERNAME:
3677         case BPF_CGROUP_INET4_GETSOCKNAME:
3678         case BPF_CGROUP_INET6_GETSOCKNAME:
3679         case BPF_CGROUP_UDP4_SENDMSG:
3680         case BPF_CGROUP_UDP6_SENDMSG:
3681         case BPF_CGROUP_UDP4_RECVMSG:
3682         case BPF_CGROUP_UDP6_RECVMSG:
3683                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3684         case BPF_CGROUP_SOCK_OPS:
3685                 return BPF_PROG_TYPE_SOCK_OPS;
3686         case BPF_CGROUP_DEVICE:
3687                 return BPF_PROG_TYPE_CGROUP_DEVICE;
3688         case BPF_SK_MSG_VERDICT:
3689                 return BPF_PROG_TYPE_SK_MSG;
3690         case BPF_SK_SKB_STREAM_PARSER:
3691         case BPF_SK_SKB_STREAM_VERDICT:
3692         case BPF_SK_SKB_VERDICT:
3693                 return BPF_PROG_TYPE_SK_SKB;
3694         case BPF_LIRC_MODE2:
3695                 return BPF_PROG_TYPE_LIRC_MODE2;
3696         case BPF_FLOW_DISSECTOR:
3697                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
3698         case BPF_CGROUP_SYSCTL:
3699                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
3700         case BPF_CGROUP_GETSOCKOPT:
3701         case BPF_CGROUP_SETSOCKOPT:
3702                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3703         case BPF_TRACE_ITER:
3704         case BPF_TRACE_RAW_TP:
3705         case BPF_TRACE_FENTRY:
3706         case BPF_TRACE_FEXIT:
3707         case BPF_MODIFY_RETURN:
3708                 return BPF_PROG_TYPE_TRACING;
3709         case BPF_LSM_MAC:
3710                 return BPF_PROG_TYPE_LSM;
3711         case BPF_SK_LOOKUP:
3712                 return BPF_PROG_TYPE_SK_LOOKUP;
3713         case BPF_XDP:
3714                 return BPF_PROG_TYPE_XDP;
3715         case BPF_LSM_CGROUP:
3716                 return BPF_PROG_TYPE_LSM;
3717         case BPF_TCX_INGRESS:
3718         case BPF_TCX_EGRESS:
3719                 return BPF_PROG_TYPE_SCHED_CLS;
3720         default:
3721                 return BPF_PROG_TYPE_UNSPEC;
3722         }
3723 }
3724
3725 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3726                                              enum bpf_attach_type attach_type)
3727 {
3728         enum bpf_prog_type ptype;
3729
3730         switch (prog->type) {
3731         case BPF_PROG_TYPE_CGROUP_SOCK:
3732         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3733         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3734         case BPF_PROG_TYPE_SK_LOOKUP:
3735                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3736         case BPF_PROG_TYPE_CGROUP_SKB:
3737                 if (!capable(CAP_NET_ADMIN))
3738                         /* cg-skb progs can be loaded by unpriv user.
3739                          * check permissions at attach time.
3740                          */
3741                         return -EPERM;
3742                 return prog->enforce_expected_attach_type &&
3743                         prog->expected_attach_type != attach_type ?
3744                         -EINVAL : 0;
3745         case BPF_PROG_TYPE_EXT:
3746                 return 0;
3747         case BPF_PROG_TYPE_NETFILTER:
3748                 if (attach_type != BPF_NETFILTER)
3749                         return -EINVAL;
3750                 return 0;
3751         case BPF_PROG_TYPE_PERF_EVENT:
3752         case BPF_PROG_TYPE_TRACEPOINT:
3753                 if (attach_type != BPF_PERF_EVENT)
3754                         return -EINVAL;
3755                 return 0;
3756         case BPF_PROG_TYPE_KPROBE:
3757                 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
3758                     attach_type != BPF_TRACE_KPROBE_MULTI)
3759                         return -EINVAL;
3760                 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
3761                     attach_type != BPF_TRACE_UPROBE_MULTI)
3762                         return -EINVAL;
3763                 if (attach_type != BPF_PERF_EVENT &&
3764                     attach_type != BPF_TRACE_KPROBE_MULTI &&
3765                     attach_type != BPF_TRACE_UPROBE_MULTI)
3766                         return -EINVAL;
3767                 return 0;
3768         case BPF_PROG_TYPE_SCHED_CLS:
3769                 if (attach_type != BPF_TCX_INGRESS &&
3770                     attach_type != BPF_TCX_EGRESS)
3771                         return -EINVAL;
3772                 return 0;
3773         default:
3774                 ptype = attach_type_to_prog_type(attach_type);
3775                 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
3776                         return -EINVAL;
3777                 return 0;
3778         }
3779 }
3780
/* Last bpf_attr field for BPF_PROG_ATTACH; presumably what
 * CHECK_ATTR(BPF_PROG_ATTACH) looks up (macro defined outside this view).
 */
#define BPF_PROG_ATTACH_LAST_FIELD expected_revision

/* attach_flags accepted by bpf_prog_attach() for program types where
 * bpf_mprog_supported() is false.
 */
#define BPF_F_ATTACH_MASK_BASE  \
        (BPF_F_ALLOW_OVERRIDE | \
         BPF_F_ALLOW_MULTI |    \
         BPF_F_REPLACE)

/* attach_flags accepted by bpf_prog_attach() and bpf_prog_detach() for
 * program types where bpf_mprog_supported() is true.
 */
#define BPF_F_ATTACH_MASK_MPROG \
        (BPF_F_REPLACE |        \
         BPF_F_BEFORE |         \
         BPF_F_AFTER |          \
         BPF_F_ID |             \
         BPF_F_LINK)
3794
3795 static int bpf_prog_attach(const union bpf_attr *attr)
3796 {
3797         enum bpf_prog_type ptype;
3798         struct bpf_prog *prog;
3799         u32 mask;
3800         int ret;
3801
3802         if (CHECK_ATTR(BPF_PROG_ATTACH))
3803                 return -EINVAL;
3804
3805         ptype = attach_type_to_prog_type(attr->attach_type);
3806         if (ptype == BPF_PROG_TYPE_UNSPEC)
3807                 return -EINVAL;
3808         mask = bpf_mprog_supported(ptype) ?
3809                BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
3810         if (attr->attach_flags & ~mask)
3811                 return -EINVAL;
3812
3813         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
3814         if (IS_ERR(prog))
3815                 return PTR_ERR(prog);
3816
3817         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
3818                 bpf_prog_put(prog);
3819                 return -EINVAL;
3820         }
3821
3822         switch (ptype) {
3823         case BPF_PROG_TYPE_SK_SKB:
3824         case BPF_PROG_TYPE_SK_MSG:
3825                 ret = sock_map_get_from_fd(attr, prog);
3826                 break;
3827         case BPF_PROG_TYPE_LIRC_MODE2:
3828                 ret = lirc_prog_attach(attr, prog);
3829                 break;
3830         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3831                 ret = netns_bpf_prog_attach(attr, prog);
3832                 break;
3833         case BPF_PROG_TYPE_CGROUP_DEVICE:
3834         case BPF_PROG_TYPE_CGROUP_SKB:
3835         case BPF_PROG_TYPE_CGROUP_SOCK:
3836         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3837         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3838         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3839         case BPF_PROG_TYPE_SOCK_OPS:
3840         case BPF_PROG_TYPE_LSM:
3841                 if (ptype == BPF_PROG_TYPE_LSM &&
3842                     prog->expected_attach_type != BPF_LSM_CGROUP)
3843                         ret = -EINVAL;
3844                 else
3845                         ret = cgroup_bpf_prog_attach(attr, ptype, prog);
3846                 break;
3847         case BPF_PROG_TYPE_SCHED_CLS:
3848                 ret = tcx_prog_attach(attr, prog);
3849                 break;
3850         default:
3851                 ret = -EINVAL;
3852         }
3853
3854         if (ret)
3855                 bpf_prog_put(prog);
3856         return ret;
3857 }
3858
3859 #define BPF_PROG_DETACH_LAST_FIELD expected_revision
3860
3861 static int bpf_prog_detach(const union bpf_attr *attr)
3862 {
3863         struct bpf_prog *prog = NULL;
3864         enum bpf_prog_type ptype;
3865         int ret;
3866
3867         if (CHECK_ATTR(BPF_PROG_DETACH))
3868                 return -EINVAL;
3869
3870         ptype = attach_type_to_prog_type(attr->attach_type);
3871         if (bpf_mprog_supported(ptype)) {
3872                 if (ptype == BPF_PROG_TYPE_UNSPEC)
3873                         return -EINVAL;
3874                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
3875                         return -EINVAL;
3876                 if (attr->attach_bpf_fd) {
3877                         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
3878                         if (IS_ERR(prog))
3879                                 return PTR_ERR(prog);
3880                 }
3881         }
3882
3883         switch (ptype) {
3884         case BPF_PROG_TYPE_SK_MSG:
3885         case BPF_PROG_TYPE_SK_SKB:
3886                 ret = sock_map_prog_detach(attr, ptype);
3887                 break;
3888         case BPF_PROG_TYPE_LIRC_MODE2:
3889                 ret = lirc_prog_detach(attr);
3890                 break;
3891         case BPF_PROG_TYPE_FLOW_DISSECTOR:
3892                 ret = netns_bpf_prog_detach(attr, ptype);
3893                 break;
3894         case BPF_PROG_TYPE_CGROUP_DEVICE:
3895         case BPF_PROG_TYPE_CGROUP_SKB:
3896         case BPF_PROG_TYPE_CGROUP_SOCK:
3897         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3898         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3899         case BPF_PROG_TYPE_CGROUP_SYSCTL:
3900         case BPF_PROG_TYPE_SOCK_OPS:
3901         case BPF_PROG_TYPE_LSM:
3902                 ret = cgroup_bpf_prog_detach(attr, ptype);
3903                 break;
3904         case BPF_PROG_TYPE_SCHED_CLS:
3905                 ret = tcx_prog_detach(attr, prog);
3906                 break;
3907         default:
3908                 ret = -EINVAL;
3909         }
3910
3911         if (prog)
3912                 bpf_prog_put(prog);
3913         return ret;
3914 }
3915
3916 #define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
3917
3918 static int bpf_prog_query(const union bpf_attr *attr,
3919                           union bpf_attr __user *uattr)
3920 {
3921         if (!capable(CAP_NET_ADMIN))
3922                 return -EPERM;
3923         if (CHECK_ATTR(BPF_PROG_QUERY))
3924                 return -EINVAL;
3925         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
3926                 return -EINVAL;
3927
3928         switch (attr->query.attach_type) {
3929         case BPF_CGROUP_INET_INGRESS:
3930         case BPF_CGROUP_INET_EGRESS:
3931         case BPF_CGROUP_INET_SOCK_CREATE:
3932         case BPF_CGROUP_INET_SOCK_RELEASE:
3933         case BPF_CGROUP_INET4_BIND:
3934         case BPF_CGROUP_INET6_BIND:
3935         case BPF_CGROUP_INET4_POST_BIND:
3936         case BPF_CGROUP_INET6_POST_BIND:
3937         case BPF_CGROUP_INET4_CONNECT:
3938         case BPF_CGROUP_INET6_CONNECT:
3939         case BPF_CGROUP_INET4_GETPEERNAME:
3940         case BPF_CGROUP_INET6_GETPEERNAME:
3941         case BPF_CGROUP_INET4_GETSOCKNAME:
3942         case BPF_CGROUP_INET6_GETSOCKNAME:
3943         case BPF_CGROUP_UDP4_SENDMSG:
3944         case BPF_CGROUP_UDP6_SENDMSG:
3945         case BPF_CGROUP_UDP4_RECVMSG:
3946         case BPF_CGROUP_UDP6_RECVMSG:
3947         case BPF_CGROUP_SOCK_OPS:
3948         case BPF_CGROUP_DEVICE:
3949         case BPF_CGROUP_SYSCTL:
3950         case BPF_CGROUP_GETSOCKOPT:
3951         case BPF_CGROUP_SETSOCKOPT:
3952         case BPF_LSM_CGROUP:
3953                 return cgroup_bpf_prog_query(attr, uattr);
3954         case BPF_LIRC_MODE2:
3955                 return lirc_prog_query(attr, uattr);
3956         case BPF_FLOW_DISSECTOR:
3957         case BPF_SK_LOOKUP:
3958                 return netns_bpf_prog_query(attr, uattr);
3959         case BPF_SK_SKB_STREAM_PARSER:
3960         case BPF_SK_SKB_STREAM_VERDICT:
3961         case BPF_SK_MSG_VERDICT:
3962         case BPF_SK_SKB_VERDICT:
3963                 return sock_map_bpf_prog_query(attr, uattr);
3964         case BPF_TCX_INGRESS:
3965         case BPF_TCX_EGRESS:
3966                 return tcx_prog_query(attr, uattr);
3967         default:
3968                 return -EINVAL;
3969         }
3970 }
3971
3972 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
3973
3974 static int bpf_prog_test_run(const union bpf_attr *attr,
3975                              union bpf_attr __user *uattr)
3976 {
3977         struct bpf_prog *prog;
3978         int ret = -ENOTSUPP;
3979
3980         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
3981                 return -EINVAL;
3982
3983         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
3984             (!attr->test.ctx_size_in && attr->test.ctx_in))
3985                 return -EINVAL;
3986
3987         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
3988             (!attr->test.ctx_size_out && attr->test.ctx_out))
3989                 return -EINVAL;
3990
3991         prog = bpf_prog_get(attr->test.prog_fd);
3992         if (IS_ERR(prog))
3993                 return PTR_ERR(prog);
3994
3995         if (prog->aux->ops->test_run)
3996                 ret = prog->aux->ops->test_run(prog, attr, uattr);
3997
3998         bpf_prog_put(prog);
3999         return ret;
4000 }
4001
4002 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4003
4004 static int bpf_obj_get_next_id(const union bpf_attr *attr,
4005                                union bpf_attr __user *uattr,
4006                                struct idr *idr,
4007                                spinlock_t *lock)
4008 {
4009         u32 next_id = attr->start_id;
4010         int err = 0;
4011
4012         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
4013                 return -EINVAL;
4014
4015         if (!capable(CAP_SYS_ADMIN))
4016                 return -EPERM;
4017
4018         next_id++;
4019         spin_lock_bh(lock);
4020         if (!idr_get_next(idr, &next_id))
4021                 err = -ENOENT;
4022         spin_unlock_bh(lock);
4023
4024         if (!err)
4025                 err = put_user(next_id, &uattr->next_id);
4026
4027         return err;
4028 }
4029
4030 struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
4031 {
4032         struct bpf_map *map;
4033
4034         spin_lock_bh(&map_idr_lock);
4035 again:
4036         map = idr_get_next(&map_idr, id);
4037         if (map) {
4038                 map = __bpf_map_inc_not_zero(map, false);
4039                 if (IS_ERR(map)) {
4040                         (*id)++;
4041                         goto again;
4042                 }
4043         }
4044         spin_unlock_bh(&map_idr_lock);
4045
4046         return map;
4047 }
4048
4049 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
4050 {
4051         struct bpf_prog *prog;
4052
4053         spin_lock_bh(&prog_idr_lock);
4054 again:
4055         prog = idr_get_next(&prog_idr, id);
4056         if (prog) {
4057                 prog = bpf_prog_inc_not_zero(prog);
4058                 if (IS_ERR(prog)) {
4059                         (*id)++;
4060                         goto again;
4061                 }
4062         }
4063         spin_unlock_bh(&prog_idr_lock);
4064
4065         return prog;
4066 }
4067
4068 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4069
4070 struct bpf_prog *bpf_prog_by_id(u32 id)
4071 {
4072         struct bpf_prog *prog;
4073
4074         if (!id)
4075                 return ERR_PTR(-ENOENT);
4076
4077         spin_lock_bh(&prog_idr_lock);
4078         prog = idr_find(&prog_idr, id);
4079         if (prog)
4080                 prog = bpf_prog_inc_not_zero(prog);
4081         else
4082                 prog = ERR_PTR(-ENOENT);
4083         spin_unlock_bh(&prog_idr_lock);
4084         return prog;
4085 }
4086
4087 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4088 {
4089         struct bpf_prog *prog;
4090         u32 id = attr->prog_id;
4091         int fd;
4092
4093         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4094                 return -EINVAL;
4095
4096         if (!capable(CAP_SYS_ADMIN))
4097                 return -EPERM;
4098
4099         prog = bpf_prog_by_id(id);
4100         if (IS_ERR(prog))
4101                 return PTR_ERR(prog);
4102
4103         fd = bpf_prog_new_fd(prog);
4104         if (fd < 0)
4105                 bpf_prog_put(prog);
4106
4107         return fd;
4108 }
4109
4110 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4111
4112 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4113 {
4114         struct bpf_map *map;
4115         u32 id = attr->map_id;
4116         int f_flags;
4117         int fd;
4118
4119         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
4120             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4121                 return -EINVAL;
4122
4123         if (!capable(CAP_SYS_ADMIN))
4124                 return -EPERM;
4125
4126         f_flags = bpf_get_file_flag(attr->open_flags);
4127         if (f_flags < 0)
4128                 return f_flags;
4129
4130         spin_lock_bh(&map_idr_lock);
4131         map = idr_find(&map_idr, id);
4132         if (map)
4133                 map = __bpf_map_inc_not_zero(map, true);
4134         else
4135                 map = ERR_PTR(-ENOENT);
4136         spin_unlock_bh(&map_idr_lock);
4137
4138         if (IS_ERR(map))
4139                 return PTR_ERR(map);
4140
4141         fd = bpf_map_new_fd(map, f_flags);
4142         if (fd < 0)
4143                 bpf_map_put_with_uref(map);
4144
4145         return fd;
4146 }
4147
4148 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
4149                                               unsigned long addr, u32 *off,
4150                                               u32 *type)
4151 {
4152         const struct bpf_map *map;
4153         int i;
4154
4155         mutex_lock(&prog->aux->used_maps_mutex);
4156         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
4157                 map = prog->aux->used_maps[i];
4158                 if (map == (void *)addr) {
4159                         *type = BPF_PSEUDO_MAP_FD;
4160                         goto out;
4161                 }
4162                 if (!map->ops->map_direct_value_meta)
4163                         continue;
4164                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
4165                         *type = BPF_PSEUDO_MAP_VALUE;
4166                         goto out;
4167                 }
4168         }
4169         map = NULL;
4170
4171 out:
4172         mutex_unlock(&prog->aux->used_maps_mutex);
4173         return map;
4174 }
4175
/*
 * Build a copy of @prog's instructions that is suitable for dumping.
 *
 * The copy is allocated with kmemdup() and rewritten in place:
 *  - tail-call instructions become BPF_CALL with imm BPF_FUNC_tail_call;
 *  - BPF_CALL / BPF_CALL_ARGS instructions are normalised to BPF_CALL and
 *    their imm is zeroed unless bpf_dump_raw_ok(@f_cred) allows it;
 *  - BPF_LDX instructions in BPF_PROBE_MEM mode are shown as BPF_MEM loads
 *    of the same size;
 *  - 64-bit immediate loads whose value bpf_map_from_imm() resolves to one
 *    of the program's maps get the map ID and offset instead of the value.
 *
 * Returns the new instruction array, or NULL if the allocation failed.
 */
static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
                                              const struct cred *f_cred)
{
        const struct bpf_map *map;
        struct bpf_insn *insns;
        u32 off, type;
        u64 imm;
        u8 code;
        int i;

        insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
                        GFP_USER);
        if (!insns)
                return insns;

        for (i = 0; i < prog->len; i++) {
                /* 'code' keeps the opcode as it was before any rewrite
                 * below; all later checks in this iteration test it.
                 */
                code = insns[i].code;

                if (code == (BPF_JMP | BPF_TAIL_CALL)) {
                        insns[i].code = BPF_JMP | BPF_CALL;
                        insns[i].imm = BPF_FUNC_tail_call;
                        /* fall-through */
                }
                if (code == (BPF_JMP | BPF_CALL) ||
                    code == (BPF_JMP | BPF_CALL_ARGS)) {
                        if (code == (BPF_JMP | BPF_CALL_ARGS))
                                insns[i].code = BPF_JMP | BPF_CALL;
                        /* Hide the call target unless raw dumps are allowed
                         * for these credentials.
                         */
                        if (!bpf_dump_raw_ok(f_cred))
                                insns[i].imm = 0;
                        continue;
                }
                if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
                        insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
                        continue;
                }

                if (code != (BPF_LD | BPF_IMM | BPF_DW))
                        continue;

                /* The 64-bit immediate is split across this instruction
                 * (low 32 bits) and the next one (high 32 bits).
                 */
                imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
                map = bpf_map_from_imm(prog, imm, &off, &type);
                if (map) {
                        insns[i].src_reg = type;
                        insns[i].imm = map->id;
                        insns[i + 1].imm = off;
                        continue;
                }
        }

        return insns;
}
4227
4228 static int set_info_rec_size(struct bpf_prog_info *info)
4229 {
4230         /*
4231          * Ensure info.*_rec_size is the same as kernel expected size
4232          *
4233          * or
4234          *
4235          * Only allow zero *_rec_size if both _rec_size and _cnt are
4236          * zero.  In this case, the kernel will set the expected
4237          * _rec_size back to the info.
4238          */
4239
4240         if ((info->nr_func_info || info->func_info_rec_size) &&
4241             info->func_info_rec_size != sizeof(struct bpf_func_info))
4242                 return -EINVAL;
4243
4244         if ((info->nr_line_info || info->line_info_rec_size) &&
4245             info->line_info_rec_size != sizeof(struct bpf_line_info))
4246                 return -EINVAL;
4247
4248         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
4249             info->jited_line_info_rec_size != sizeof(__u64))
4250                 return -EINVAL;
4251
4252         info->func_info_rec_size = sizeof(struct bpf_func_info);
4253         info->line_info_rec_size = sizeof(struct bpf_line_info);
4254         info->jited_line_info_rec_size = sizeof(__u64);
4255
4256         return 0;
4257 }
4258
4259 static int bpf_prog_get_info_by_fd(struct file *file,
4260                                    struct bpf_prog *prog,
4261                                    const union bpf_attr *attr,
4262                                    union bpf_attr __user *uattr)
4263 {
4264         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4265         struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4266         struct bpf_prog_info info;
4267         u32 info_len = attr->info.info_len;
4268         struct bpf_prog_kstats stats;
4269         char __user *uinsns;
4270         u32 ulen;
4271         int err;
4272
4273         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4274         if (err)
4275                 return err;
4276         info_len = min_t(u32, sizeof(info), info_len);
4277
4278         memset(&info, 0, sizeof(info));
4279         if (copy_from_user(&info, uinfo, info_len))
4280                 return -EFAULT;
4281
4282         info.type = prog->type;
4283         info.id = prog->aux->id;
4284         info.load_time = prog->aux->load_time;
4285         info.created_by_uid = from_kuid_munged(current_user_ns(),
4286                                                prog->aux->user->uid);
4287         info.gpl_compatible = prog->gpl_compatible;
4288
4289         memcpy(info.tag, prog->tag, sizeof(prog->tag));
4290         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4291
4292         mutex_lock(&prog->aux->used_maps_mutex);
4293         ulen = info.nr_map_ids;
4294         info.nr_map_ids = prog->aux->used_map_cnt;
4295         ulen = min_t(u32, info.nr_map_ids, ulen);
4296         if (ulen) {
4297                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4298                 u32 i;
4299
4300                 for (i = 0; i < ulen; i++)
4301                         if (put_user(prog->aux->used_maps[i]->id,
4302                                      &user_map_ids[i])) {
4303                                 mutex_unlock(&prog->aux->used_maps_mutex);
4304                                 return -EFAULT;
4305                         }
4306         }
4307         mutex_unlock(&prog->aux->used_maps_mutex);
4308
4309         err = set_info_rec_size(&info);
4310         if (err)
4311                 return err;
4312
4313         bpf_prog_get_stats(prog, &stats);
4314         info.run_time_ns = stats.nsecs;
4315         info.run_cnt = stats.cnt;
4316         info.recursion_misses = stats.misses;
4317
4318         info.verified_insns = prog->aux->verified_insns;
4319
4320         if (!bpf_capable()) {
4321                 info.jited_prog_len = 0;
4322                 info.xlated_prog_len = 0;
4323                 info.nr_jited_ksyms = 0;
4324                 info.nr_jited_func_lens = 0;
4325                 info.nr_func_info = 0;
4326                 info.nr_line_info = 0;
4327                 info.nr_jited_line_info = 0;
4328                 goto done;
4329         }
4330
4331         ulen = info.xlated_prog_len;
4332         info.xlated_prog_len = bpf_prog_insn_size(prog);
4333         if (info.xlated_prog_len && ulen) {
4334                 struct bpf_insn *insns_sanitized;
4335                 bool fault;
4336
4337                 if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
4338                         info.xlated_prog_insns = 0;
4339                         goto done;
4340                 }
4341                 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
4342                 if (!insns_sanitized)
4343                         return -ENOMEM;
4344                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4345                 ulen = min_t(u32, info.xlated_prog_len, ulen);
4346                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
4347                 kfree(insns_sanitized);
4348                 if (fault)
4349                         return -EFAULT;
4350         }
4351
4352         if (bpf_prog_is_offloaded(prog->aux)) {
4353                 err = bpf_prog_offload_info_fill(&info, prog);
4354                 if (err)
4355                         return err;
4356                 goto done;
4357         }
4358
4359         /* NOTE: the following code is supposed to be skipped for offload.
4360          * bpf_prog_offload_info_fill() is the place to fill similar fields
4361          * for offload.
4362          */
4363         ulen = info.jited_prog_len;
4364         if (prog->aux->func_cnt) {
4365                 u32 i;
4366
4367                 info.jited_prog_len = 0;
4368                 for (i = 0; i < prog->aux->func_cnt; i++)
4369                         info.jited_prog_len += prog->aux->func[i]->jited_len;
4370         } else {
4371                 info.jited_prog_len = prog->jited_len;
4372         }
4373
4374         if (info.jited_prog_len && ulen) {
4375                 if (bpf_dump_raw_ok(file->f_cred)) {
4376                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
4377                         ulen = min_t(u32, info.jited_prog_len, ulen);
4378
4379                         /* for multi-function programs, copy the JITed
4380                          * instructions for all the functions
4381                          */
4382                         if (prog->aux->func_cnt) {
4383                                 u32 len, free, i;
4384                                 u8 *img;
4385
4386                                 free = ulen;
4387                                 for (i = 0; i < prog->aux->func_cnt; i++) {
4388                                         len = prog->aux->func[i]->jited_len;
4389                                         len = min_t(u32, len, free);
4390                                         img = (u8 *) prog->aux->func[i]->bpf_func;
4391                                         if (copy_to_user(uinsns, img, len))
4392                                                 return -EFAULT;
4393                                         uinsns += len;
4394                                         free -= len;
4395                                         if (!free)
4396                                                 break;
4397                                 }
4398                         } else {
4399                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
4400                                         return -EFAULT;
4401                         }
4402                 } else {
4403                         info.jited_prog_insns = 0;
4404                 }
4405         }
4406
4407         ulen = info.nr_jited_ksyms;
4408         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4409         if (ulen) {
4410                 if (bpf_dump_raw_ok(file->f_cred)) {
4411                         unsigned long ksym_addr;
4412                         u64 __user *user_ksyms;
4413                         u32 i;
4414
4415                         /* copy the address of the kernel symbol
4416                          * corresponding to each function
4417                          */
4418                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4419                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4420                         if (prog->aux->func_cnt) {
4421                                 for (i = 0; i < ulen; i++) {
4422                                         ksym_addr = (unsigned long)
4423                                                 prog->aux->func[i]->bpf_func;
4424                                         if (put_user((u64) ksym_addr,
4425                                                      &user_ksyms[i]))
4426                                                 return -EFAULT;
4427                                 }
4428                         } else {
4429                                 ksym_addr = (unsigned long) prog->bpf_func;
4430                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
4431                                         return -EFAULT;
4432                         }
4433                 } else {
4434                         info.jited_ksyms = 0;
4435                 }
4436         }
4437
4438         ulen = info.nr_jited_func_lens;
4439         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4440         if (ulen) {
4441                 if (bpf_dump_raw_ok(file->f_cred)) {
4442                         u32 __user *user_lens;
4443                         u32 func_len, i;
4444
4445                         /* copy the JITed image lengths for each function */
4446                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4447                         user_lens = u64_to_user_ptr(info.jited_func_lens);
4448                         if (prog->aux->func_cnt) {
4449                                 for (i = 0; i < ulen; i++) {
4450                                         func_len =
4451                                                 prog->aux->func[i]->jited_len;
4452                                         if (put_user(func_len, &user_lens[i]))
4453                                                 return -EFAULT;
4454                                 }
4455                         } else {
4456                                 func_len = prog->jited_len;
4457                                 if (put_user(func_len, &user_lens[0]))
4458                                         return -EFAULT;
4459                         }
4460                 } else {
4461                         info.jited_func_lens = 0;
4462                 }
4463         }
4464
4465         if (prog->aux->btf)
4466                 info.btf_id = btf_obj_id(prog->aux->btf);
4467         info.attach_btf_id = prog->aux->attach_btf_id;
4468         if (attach_btf)
4469                 info.attach_btf_obj_id = btf_obj_id(attach_btf);
4470
4471         ulen = info.nr_func_info;
4472         info.nr_func_info = prog->aux->func_info_cnt;
4473         if (info.nr_func_info && ulen) {
4474                 char __user *user_finfo;
4475
4476                 user_finfo = u64_to_user_ptr(info.func_info);
4477                 ulen = min_t(u32, info.nr_func_info, ulen);
4478                 if (copy_to_user(user_finfo, prog->aux->func_info,
4479                                  info.func_info_rec_size * ulen))
4480                         return -EFAULT;
4481         }
4482
4483         ulen = info.nr_line_info;
4484         info.nr_line_info = prog->aux->nr_linfo;
4485         if (info.nr_line_info && ulen) {
4486                 __u8 __user *user_linfo;
4487
4488                 user_linfo = u64_to_user_ptr(info.line_info);
4489                 ulen = min_t(u32, info.nr_line_info, ulen);
4490                 if (copy_to_user(user_linfo, prog->aux->linfo,
4491                                  info.line_info_rec_size * ulen))
4492                         return -EFAULT;
4493         }
4494
4495         ulen = info.nr_jited_line_info;
4496         if (prog->aux->jited_linfo)
4497                 info.nr_jited_line_info = prog->aux->nr_linfo;
4498         else
4499                 info.nr_jited_line_info = 0;
4500         if (info.nr_jited_line_info && ulen) {
4501                 if (bpf_dump_raw_ok(file->f_cred)) {
4502                         unsigned long line_addr;
4503                         __u64 __user *user_linfo;
4504                         u32 i;
4505
4506                         user_linfo = u64_to_user_ptr(info.jited_line_info);
4507                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
4508                         for (i = 0; i < ulen; i++) {
4509                                 line_addr = (unsigned long)prog->aux->jited_linfo[i];
4510                                 if (put_user((__u64)line_addr, &user_linfo[i]))
4511                                         return -EFAULT;
4512                         }
4513                 } else {
4514                         info.jited_line_info = 0;
4515                 }
4516         }
4517
4518         ulen = info.nr_prog_tags;
4519         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4520         if (ulen) {
4521                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4522                 u32 i;
4523
4524                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
4525                 ulen = min_t(u32, info.nr_prog_tags, ulen);
4526                 if (prog->aux->func_cnt) {
4527                         for (i = 0; i < ulen; i++) {
4528                                 if (copy_to_user(user_prog_tags[i],
4529                                                  prog->aux->func[i]->tag,
4530                                                  BPF_TAG_SIZE))
4531                                         return -EFAULT;
4532                         }
4533                 } else {
4534                         if (copy_to_user(user_prog_tags[0],
4535                                          prog->tag, BPF_TAG_SIZE))
4536                                 return -EFAULT;
4537                 }
4538         }
4539
4540 done:
4541         if (copy_to_user(uinfo, &info, info_len) ||
4542             put_user(info_len, &uattr->info.info_len))
4543                 return -EFAULT;
4544
4545         return 0;
4546 }
4547
4548 static int bpf_map_get_info_by_fd(struct file *file,
4549                                   struct bpf_map *map,
4550                                   const union bpf_attr *attr,
4551                                   union bpf_attr __user *uattr)
4552 {
4553         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4554         struct bpf_map_info info;
4555         u32 info_len = attr->info.info_len;
4556         int err;
4557
4558         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4559         if (err)
4560                 return err;
4561         info_len = min_t(u32, sizeof(info), info_len);
4562
4563         memset(&info, 0, sizeof(info));
4564         info.type = map->map_type;
4565         info.id = map->id;
4566         info.key_size = map->key_size;
4567         info.value_size = map->value_size;
4568         info.max_entries = map->max_entries;
4569         info.map_flags = map->map_flags;
4570         info.map_extra = map->map_extra;
4571         memcpy(info.name, map->name, sizeof(map->name));
4572
4573         if (map->btf) {
4574                 info.btf_id = btf_obj_id(map->btf);
4575                 info.btf_key_type_id = map->btf_key_type_id;
4576                 info.btf_value_type_id = map->btf_value_type_id;
4577         }
4578         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4579
4580         if (bpf_map_is_offloaded(map)) {
4581                 err = bpf_map_offload_info_fill(&info, map);
4582                 if (err)
4583                         return err;
4584         }
4585
4586         if (copy_to_user(uinfo, &info, info_len) ||
4587             put_user(info_len, &uattr->info.info_len))
4588                 return -EFAULT;
4589
4590         return 0;
4591 }
4592
4593 static int bpf_btf_get_info_by_fd(struct file *file,
4594                                   struct btf *btf,
4595                                   const union bpf_attr *attr,
4596                                   union bpf_attr __user *uattr)
4597 {
4598         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4599         u32 info_len = attr->info.info_len;
4600         int err;
4601
4602         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4603         if (err)
4604                 return err;
4605
4606         return btf_get_info_by_fd(btf, attr, uattr);
4607 }
4608
4609 static int bpf_link_get_info_by_fd(struct file *file,
4610                                   struct bpf_link *link,
4611                                   const union bpf_attr *attr,
4612                                   union bpf_attr __user *uattr)
4613 {
4614         struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4615         struct bpf_link_info info;
4616         u32 info_len = attr->info.info_len;
4617         int err;
4618
4619         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4620         if (err)
4621                 return err;
4622         info_len = min_t(u32, sizeof(info), info_len);
4623
4624         memset(&info, 0, sizeof(info));
4625         if (copy_from_user(&info, uinfo, info_len))
4626                 return -EFAULT;
4627
4628         info.type = link->type;
4629         info.id = link->id;
4630         if (link->prog)
4631                 info.prog_id = link->prog->aux->id;
4632
4633         if (link->ops->fill_link_info) {
4634                 err = link->ops->fill_link_info(link, &info);
4635                 if (err)
4636                         return err;
4637         }
4638
4639         if (copy_to_user(uinfo, &info, info_len) ||
4640             put_user(info_len, &uattr->info.info_len))
4641                 return -EFAULT;
4642
4643         return 0;
4644 }
4645
4646
4647 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4648
4649 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4650                                   union bpf_attr __user *uattr)
4651 {
4652         int ufd = attr->info.bpf_fd;
4653         struct fd f;
4654         int err;
4655
4656         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4657                 return -EINVAL;
4658
4659         f = fdget(ufd);
4660         if (!f.file)
4661                 return -EBADFD;
4662
4663         if (f.file->f_op == &bpf_prog_fops)
4664                 err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4665                                               uattr);
4666         else if (f.file->f_op == &bpf_map_fops)
4667                 err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4668                                              uattr);
4669         else if (f.file->f_op == &btf_fops)
4670                 err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4671         else if (f.file->f_op == &bpf_link_fops)
4672                 err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4673                                               attr, uattr);
4674         else
4675                 err = -EINVAL;
4676
4677         fdput(f);
4678         return err;
4679 }
4680
4681 #define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
4682
4683 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4684 {
4685         if (CHECK_ATTR(BPF_BTF_LOAD))
4686                 return -EINVAL;
4687
4688         if (!bpf_capable())
4689                 return -EPERM;
4690
4691         return btf_new_fd(attr, uattr, uattr_size);
4692 }
4693
4694 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4695
4696 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4697 {
4698         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4699                 return -EINVAL;
4700
4701         if (!capable(CAP_SYS_ADMIN))
4702                 return -EPERM;
4703
4704         return btf_get_fd_by_id(attr->btf_id);
4705 }
4706
4707 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
4708                                     union bpf_attr __user *uattr,
4709                                     u32 prog_id, u32 fd_type,
4710                                     const char *buf, u64 probe_offset,
4711                                     u64 probe_addr)
4712 {
4713         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
4714         u32 len = buf ? strlen(buf) : 0, input_len;
4715         int err = 0;
4716
4717         if (put_user(len, &uattr->task_fd_query.buf_len))
4718                 return -EFAULT;
4719         input_len = attr->task_fd_query.buf_len;
4720         if (input_len && ubuf) {
4721                 if (!len) {
4722                         /* nothing to copy, just make ubuf NULL terminated */
4723                         char zero = '\0';
4724
4725                         if (put_user(zero, ubuf))
4726                                 return -EFAULT;
4727                 } else if (input_len >= len + 1) {
4728                         /* ubuf can hold the string with NULL terminator */
4729                         if (copy_to_user(ubuf, buf, len + 1))
4730                                 return -EFAULT;
4731                 } else {
4732                         /* ubuf cannot hold the string with NULL terminator,
4733                          * do a partial copy with NULL terminator.
4734                          */
4735                         char zero = '\0';
4736
4737                         err = -ENOSPC;
4738                         if (copy_to_user(ubuf, buf, input_len - 1))
4739                                 return -EFAULT;
4740                         if (put_user(zero, ubuf + input_len - 1))
4741                                 return -EFAULT;
4742                 }
4743         }
4744
4745         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
4746             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
4747             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
4748             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
4749                 return -EFAULT;
4750
4751         return err;
4752 }
4753
4754 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
4755
4756 static int bpf_task_fd_query(const union bpf_attr *attr,
4757                              union bpf_attr __user *uattr)
4758 {
4759         pid_t pid = attr->task_fd_query.pid;
4760         u32 fd = attr->task_fd_query.fd;
4761         const struct perf_event *event;
4762         struct task_struct *task;
4763         struct file *file;
4764         int err;
4765
4766         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
4767                 return -EINVAL;
4768
4769         if (!capable(CAP_SYS_ADMIN))
4770                 return -EPERM;
4771
4772         if (attr->task_fd_query.flags != 0)
4773                 return -EINVAL;
4774
4775         rcu_read_lock();
4776         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
4777         rcu_read_unlock();
4778         if (!task)
4779                 return -ENOENT;
4780
4781         err = 0;
4782         file = fget_task(task, fd);
4783         put_task_struct(task);
4784         if (!file)
4785                 return -EBADF;
4786
4787         if (file->f_op == &bpf_link_fops) {
4788                 struct bpf_link *link = file->private_data;
4789
4790                 if (link->ops == &bpf_raw_tp_link_lops) {
4791                         struct bpf_raw_tp_link *raw_tp =
4792                                 container_of(link, struct bpf_raw_tp_link, link);
4793                         struct bpf_raw_event_map *btp = raw_tp->btp;
4794
4795                         err = bpf_task_fd_query_copy(attr, uattr,
4796                                                      raw_tp->link.prog->aux->id,
4797                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
4798                                                      btp->tp->name, 0, 0);
4799                         goto put_file;
4800                 }
4801                 goto out_not_supp;
4802         }
4803
4804         event = perf_get_event(file);
4805         if (!IS_ERR(event)) {
4806                 u64 probe_offset, probe_addr;
4807                 u32 prog_id, fd_type;
4808                 const char *buf;
4809
4810                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
4811                                               &buf, &probe_offset,
4812                                               &probe_addr);
4813                 if (!err)
4814                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
4815                                                      fd_type, buf,
4816                                                      probe_offset,
4817                                                      probe_addr);
4818                 goto put_file;
4819         }
4820
4821 out_not_supp:
4822         err = -ENOTSUPP;
4823 put_file:
4824         fput(file);
4825         return err;
4826 }
4827
4828 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
4829
4830 #define BPF_DO_BATCH(fn, ...)                   \
4831         do {                                    \
4832                 if (!fn) {                      \
4833                         err = -ENOTSUPP;        \
4834                         goto err_put;           \
4835                 }                               \
4836                 err = fn(__VA_ARGS__);          \
4837         } while (0)
4838
4839 static int bpf_map_do_batch(const union bpf_attr *attr,
4840                             union bpf_attr __user *uattr,
4841                             int cmd)
4842 {
4843         bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
4844                          cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
4845         bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
4846         struct bpf_map *map;
4847         int err, ufd;
4848         struct fd f;
4849
4850         if (CHECK_ATTR(BPF_MAP_BATCH))
4851                 return -EINVAL;
4852
4853         ufd = attr->batch.map_fd;
4854         f = fdget(ufd);
4855         map = __bpf_map_get(f);
4856         if (IS_ERR(map))
4857                 return PTR_ERR(map);
4858         if (has_write)
4859                 bpf_map_write_active_inc(map);
4860         if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
4861                 err = -EPERM;
4862                 goto err_put;
4863         }
4864         if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
4865                 err = -EPERM;
4866                 goto err_put;
4867         }
4868
4869         if (cmd == BPF_MAP_LOOKUP_BATCH)
4870                 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
4871         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
4872                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
4873         else if (cmd == BPF_MAP_UPDATE_BATCH)
4874                 BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
4875         else
4876                 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
4877 err_put:
4878         if (has_write)
4879                 bpf_map_write_active_dec(map);
4880         fdput(f);
4881         return err;
4882 }
4883
4884 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
4885 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
4886 {
4887         struct bpf_prog *prog;
4888         int ret;
4889
4890         if (CHECK_ATTR(BPF_LINK_CREATE))
4891                 return -EINVAL;
4892
4893         if (attr->link_create.attach_type == BPF_STRUCT_OPS)
4894                 return bpf_struct_ops_link_create(attr);
4895
4896         prog = bpf_prog_get(attr->link_create.prog_fd);
4897         if (IS_ERR(prog))
4898                 return PTR_ERR(prog);
4899
4900         ret = bpf_prog_attach_check_attach_type(prog,
4901                                                 attr->link_create.attach_type);
4902         if (ret)
4903                 goto out;
4904
4905         switch (prog->type) {
4906         case BPF_PROG_TYPE_CGROUP_SKB:
4907         case BPF_PROG_TYPE_CGROUP_SOCK:
4908         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4909         case BPF_PROG_TYPE_SOCK_OPS:
4910         case BPF_PROG_TYPE_CGROUP_DEVICE:
4911         case BPF_PROG_TYPE_CGROUP_SYSCTL:
4912         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4913                 ret = cgroup_bpf_link_attach(attr, prog);
4914                 break;
4915         case BPF_PROG_TYPE_EXT:
4916                 ret = bpf_tracing_prog_attach(prog,
4917                                               attr->link_create.target_fd,
4918                                               attr->link_create.target_btf_id,
4919                                               attr->link_create.tracing.cookie);
4920                 break;
4921         case BPF_PROG_TYPE_LSM:
4922         case BPF_PROG_TYPE_TRACING:
4923                 if (attr->link_create.attach_type != prog->expected_attach_type) {
4924                         ret = -EINVAL;
4925                         goto out;
4926                 }
4927                 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
4928                         ret = bpf_raw_tp_link_attach(prog, NULL);
4929                 else if (prog->expected_attach_type == BPF_TRACE_ITER)
4930                         ret = bpf_iter_link_attach(attr, uattr, prog);
4931                 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
4932                         ret = cgroup_bpf_link_attach(attr, prog);
4933                 else
4934                         ret = bpf_tracing_prog_attach(prog,
4935                                                       attr->link_create.target_fd,
4936                                                       attr->link_create.target_btf_id,
4937                                                       attr->link_create.tracing.cookie);
4938                 break;
4939         case BPF_PROG_TYPE_FLOW_DISSECTOR:
4940         case BPF_PROG_TYPE_SK_LOOKUP:
4941                 ret = netns_bpf_link_create(attr, prog);
4942                 break;
4943 #ifdef CONFIG_NET
4944         case BPF_PROG_TYPE_XDP:
4945                 ret = bpf_xdp_link_attach(attr, prog);
4946                 break;
4947         case BPF_PROG_TYPE_SCHED_CLS:
4948                 ret = tcx_link_attach(attr, prog);
4949                 break;
4950         case BPF_PROG_TYPE_NETFILTER:
4951                 ret = bpf_nf_link_attach(attr, prog);
4952                 break;
4953 #endif
4954         case BPF_PROG_TYPE_PERF_EVENT:
4955         case BPF_PROG_TYPE_TRACEPOINT:
4956                 ret = bpf_perf_link_attach(attr, prog);
4957                 break;
4958         case BPF_PROG_TYPE_KPROBE:
4959                 if (attr->link_create.attach_type == BPF_PERF_EVENT)
4960                         ret = bpf_perf_link_attach(attr, prog);
4961                 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
4962                         ret = bpf_kprobe_multi_link_attach(attr, prog);
4963                 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
4964                         ret = bpf_uprobe_multi_link_attach(attr, prog);
4965                 break;
4966         default:
4967                 ret = -EINVAL;
4968         }
4969
4970 out:
4971         if (ret < 0)
4972                 bpf_prog_put(prog);
4973         return ret;
4974 }
4975
4976 static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
4977 {
4978         struct bpf_map *new_map, *old_map = NULL;
4979         int ret;
4980
4981         new_map = bpf_map_get(attr->link_update.new_map_fd);
4982         if (IS_ERR(new_map))
4983                 return PTR_ERR(new_map);
4984
4985         if (attr->link_update.flags & BPF_F_REPLACE) {
4986                 old_map = bpf_map_get(attr->link_update.old_map_fd);
4987                 if (IS_ERR(old_map)) {
4988                         ret = PTR_ERR(old_map);
4989                         goto out_put;
4990                 }
4991         } else if (attr->link_update.old_map_fd) {
4992                 ret = -EINVAL;
4993                 goto out_put;
4994         }
4995
4996         ret = link->ops->update_map(link, new_map, old_map);
4997
4998         if (old_map)
4999                 bpf_map_put(old_map);
5000 out_put:
5001         bpf_map_put(new_map);
5002         return ret;
5003 }
5004
5005 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5006
5007 static int link_update(union bpf_attr *attr)
5008 {
5009         struct bpf_prog *old_prog = NULL, *new_prog;
5010         struct bpf_link *link;
5011         u32 flags;
5012         int ret;
5013
5014         if (CHECK_ATTR(BPF_LINK_UPDATE))
5015                 return -EINVAL;
5016
5017         flags = attr->link_update.flags;
5018         if (flags & ~BPF_F_REPLACE)
5019                 return -EINVAL;
5020
5021         link = bpf_link_get_from_fd(attr->link_update.link_fd);
5022         if (IS_ERR(link))
5023                 return PTR_ERR(link);
5024
5025         if (link->ops->update_map) {
5026                 ret = link_update_map(link, attr);
5027                 goto out_put_link;
5028         }
5029
5030         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
5031         if (IS_ERR(new_prog)) {
5032                 ret = PTR_ERR(new_prog);
5033                 goto out_put_link;
5034         }
5035
5036         if (flags & BPF_F_REPLACE) {
5037                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
5038                 if (IS_ERR(old_prog)) {
5039                         ret = PTR_ERR(old_prog);
5040                         old_prog = NULL;
5041                         goto out_put_progs;
5042                 }
5043         } else if (attr->link_update.old_prog_fd) {
5044                 ret = -EINVAL;
5045                 goto out_put_progs;
5046         }
5047
5048         if (link->ops->update_prog)
5049                 ret = link->ops->update_prog(link, new_prog, old_prog);
5050         else
5051                 ret = -EINVAL;
5052
5053 out_put_progs:
5054         if (old_prog)
5055                 bpf_prog_put(old_prog);
5056         if (ret)
5057                 bpf_prog_put(new_prog);
5058 out_put_link:
5059         bpf_link_put_direct(link);
5060         return ret;
5061 }
5062
5063 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5064
5065 static int link_detach(union bpf_attr *attr)
5066 {
5067         struct bpf_link *link;
5068         int ret;
5069
5070         if (CHECK_ATTR(BPF_LINK_DETACH))
5071                 return -EINVAL;
5072
5073         link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5074         if (IS_ERR(link))
5075                 return PTR_ERR(link);
5076
5077         if (link->ops->detach)
5078                 ret = link->ops->detach(link);
5079         else
5080                 ret = -EOPNOTSUPP;
5081
5082         bpf_link_put_direct(link);
5083         return ret;
5084 }
5085
5086 static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
5087 {
5088         return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
5089 }
5090
5091 struct bpf_link *bpf_link_by_id(u32 id)
5092 {
5093         struct bpf_link *link;
5094
5095         if (!id)
5096                 return ERR_PTR(-ENOENT);
5097
5098         spin_lock_bh(&link_idr_lock);
5099         /* before link is "settled", ID is 0, pretend it doesn't exist yet */
5100         link = idr_find(&link_idr, id);
5101         if (link) {
5102                 if (link->id)
5103                         link = bpf_link_inc_not_zero(link);
5104                 else
5105                         link = ERR_PTR(-EAGAIN);
5106         } else {
5107                 link = ERR_PTR(-ENOENT);
5108         }
5109         spin_unlock_bh(&link_idr_lock);
5110         return link;
5111 }
5112
5113 struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
5114 {
5115         struct bpf_link *link;
5116
5117         spin_lock_bh(&link_idr_lock);
5118 again:
5119         link = idr_get_next(&link_idr, id);
5120         if (link) {
5121                 link = bpf_link_inc_not_zero(link);
5122                 if (IS_ERR(link)) {
5123                         (*id)++;
5124                         goto again;
5125                 }
5126         }
5127         spin_unlock_bh(&link_idr_lock);
5128
5129         return link;
5130 }
5131
5132 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5133
5134 static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5135 {
5136         struct bpf_link *link;
5137         u32 id = attr->link_id;
5138         int fd;
5139
5140         if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5141                 return -EINVAL;
5142
5143         if (!capable(CAP_SYS_ADMIN))
5144                 return -EPERM;
5145
5146         link = bpf_link_by_id(id);
5147         if (IS_ERR(link))
5148                 return PTR_ERR(link);
5149
5150         fd = bpf_link_new_fd(link);
5151         if (fd < 0)
5152                 bpf_link_put_direct(link);
5153
5154         return fd;
5155 }
5156
5157 DEFINE_MUTEX(bpf_stats_enabled_mutex);
5158
5159 static int bpf_stats_release(struct inode *inode, struct file *file)
5160 {
5161         mutex_lock(&bpf_stats_enabled_mutex);
5162         static_key_slow_dec(&bpf_stats_enabled_key.key);
5163         mutex_unlock(&bpf_stats_enabled_mutex);
5164         return 0;
5165 }
5166
5167 static const struct file_operations bpf_stats_fops = {
5168         .release = bpf_stats_release,
5169 };
5170
5171 static int bpf_enable_runtime_stats(void)
5172 {
5173         int fd;
5174
5175         mutex_lock(&bpf_stats_enabled_mutex);
5176
5177         /* Set a very high limit to avoid overflow */
5178         if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
5179                 mutex_unlock(&bpf_stats_enabled_mutex);
5180                 return -EBUSY;
5181         }
5182
5183         fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
5184         if (fd >= 0)
5185                 static_key_slow_inc(&bpf_stats_enabled_key.key);
5186
5187         mutex_unlock(&bpf_stats_enabled_mutex);
5188         return fd;
5189 }
5190
5191 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5192
5193 static int bpf_enable_stats(union bpf_attr *attr)
5194 {
5195
5196         if (CHECK_ATTR(BPF_ENABLE_STATS))
5197                 return -EINVAL;
5198
5199         if (!capable(CAP_SYS_ADMIN))
5200                 return -EPERM;
5201
5202         switch (attr->enable_stats.type) {
5203         case BPF_STATS_RUN_TIME:
5204                 return bpf_enable_runtime_stats();
5205         default:
5206                 break;
5207         }
5208         return -EINVAL;
5209 }
5210
5211 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5212
5213 static int bpf_iter_create(union bpf_attr *attr)
5214 {
5215         struct bpf_link *link;
5216         int err;
5217
5218         if (CHECK_ATTR(BPF_ITER_CREATE))
5219                 return -EINVAL;
5220
5221         if (attr->iter_create.flags)
5222                 return -EINVAL;
5223
5224         link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5225         if (IS_ERR(link))
5226                 return PTR_ERR(link);
5227
5228         err = bpf_iter_new_fd(link);
5229         bpf_link_put_direct(link);
5230
5231         return err;
5232 }
5233
5234 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5235
5236 static int bpf_prog_bind_map(union bpf_attr *attr)
5237 {
5238         struct bpf_prog *prog;
5239         struct bpf_map *map;
5240         struct bpf_map **used_maps_old, **used_maps_new;
5241         int i, ret = 0;
5242
5243         if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5244                 return -EINVAL;
5245
5246         if (attr->prog_bind_map.flags)
5247                 return -EINVAL;
5248
5249         prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
5250         if (IS_ERR(prog))
5251                 return PTR_ERR(prog);
5252
5253         map = bpf_map_get(attr->prog_bind_map.map_fd);
5254         if (IS_ERR(map)) {
5255                 ret = PTR_ERR(map);
5256                 goto out_prog_put;
5257         }
5258
5259         mutex_lock(&prog->aux->used_maps_mutex);
5260
5261         used_maps_old = prog->aux->used_maps;
5262
5263         for (i = 0; i < prog->aux->used_map_cnt; i++)
5264                 if (used_maps_old[i] == map) {
5265                         bpf_map_put(map);
5266                         goto out_unlock;
5267                 }
5268
5269         used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
5270                                       sizeof(used_maps_new[0]),
5271                                       GFP_KERNEL);
5272         if (!used_maps_new) {
5273                 ret = -ENOMEM;
5274                 goto out_unlock;
5275         }
5276
5277         memcpy(used_maps_new, used_maps_old,
5278                sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
5279         used_maps_new[prog->aux->used_map_cnt] = map;
5280
5281         prog->aux->used_map_cnt++;
5282         prog->aux->used_maps = used_maps_new;
5283
5284         kfree(used_maps_old);
5285
5286 out_unlock:
5287         mutex_unlock(&prog->aux->used_maps_mutex);
5288
5289         if (ret)
5290                 bpf_map_put(map);
5291 out_prog_put:
5292         bpf_prog_put(prog);
5293         return ret;
5294 }
5295
5296 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5297 {
5298         union bpf_attr attr;
5299         int err;
5300
5301         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
5302         if (err)
5303                 return err;
5304         size = min_t(u32, size, sizeof(attr));
5305
5306         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
5307         memset(&attr, 0, sizeof(attr));
5308         if (copy_from_bpfptr(&attr, uattr, size) != 0)
5309                 return -EFAULT;
5310
5311         err = security_bpf(cmd, &attr, size);
5312         if (err < 0)
5313                 return err;
5314
5315         switch (cmd) {
5316         case BPF_MAP_CREATE:
5317                 err = map_create(&attr);
5318                 break;
5319         case BPF_MAP_LOOKUP_ELEM:
5320                 err = map_lookup_elem(&attr);
5321                 break;
5322         case BPF_MAP_UPDATE_ELEM:
5323                 err = map_update_elem(&attr, uattr);
5324                 break;
5325         case BPF_MAP_DELETE_ELEM:
5326                 err = map_delete_elem(&attr, uattr);
5327                 break;
5328         case BPF_MAP_GET_NEXT_KEY:
5329                 err = map_get_next_key(&attr);
5330                 break;
5331         case BPF_MAP_FREEZE:
5332                 err = map_freeze(&attr);
5333                 break;
5334         case BPF_PROG_LOAD:
5335                 err = bpf_prog_load(&attr, uattr, size);
5336                 break;
5337         case BPF_OBJ_PIN:
5338                 err = bpf_obj_pin(&attr);
5339                 break;
5340         case BPF_OBJ_GET:
5341                 err = bpf_obj_get(&attr);
5342                 break;
5343         case BPF_PROG_ATTACH:
5344                 err = bpf_prog_attach(&attr);
5345                 break;
5346         case BPF_PROG_DETACH:
5347                 err = bpf_prog_detach(&attr);
5348                 break;
5349         case BPF_PROG_QUERY:
5350                 err = bpf_prog_query(&attr, uattr.user);
5351                 break;
5352         case BPF_PROG_TEST_RUN:
5353                 err = bpf_prog_test_run(&attr, uattr.user);
5354                 break;
5355         case BPF_PROG_GET_NEXT_ID:
5356                 err = bpf_obj_get_next_id(&attr, uattr.user,
5357                                           &prog_idr, &prog_idr_lock);
5358                 break;
5359         case BPF_MAP_GET_NEXT_ID:
5360                 err = bpf_obj_get_next_id(&attr, uattr.user,
5361                                           &map_idr, &map_idr_lock);
5362                 break;
5363         case BPF_BTF_GET_NEXT_ID:
5364                 err = bpf_obj_get_next_id(&attr, uattr.user,
5365                                           &btf_idr, &btf_idr_lock);
5366                 break;
5367         case BPF_PROG_GET_FD_BY_ID:
5368                 err = bpf_prog_get_fd_by_id(&attr);
5369                 break;
5370         case BPF_MAP_GET_FD_BY_ID:
5371                 err = bpf_map_get_fd_by_id(&attr);
5372                 break;
5373         case BPF_OBJ_GET_INFO_BY_FD:
5374                 err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5375                 break;
5376         case BPF_RAW_TRACEPOINT_OPEN:
5377                 err = bpf_raw_tracepoint_open(&attr);
5378                 break;
5379         case BPF_BTF_LOAD:
5380                 err = bpf_btf_load(&attr, uattr, size);
5381                 break;
5382         case BPF_BTF_GET_FD_BY_ID:
5383                 err = bpf_btf_get_fd_by_id(&attr);
5384                 break;
5385         case BPF_TASK_FD_QUERY:
5386                 err = bpf_task_fd_query(&attr, uattr.user);
5387                 break;
5388         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5389                 err = map_lookup_and_delete_elem(&attr);
5390                 break;
5391         case BPF_MAP_LOOKUP_BATCH:
5392                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5393                 break;
5394         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5395                 err = bpf_map_do_batch(&attr, uattr.user,
5396                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5397                 break;
5398         case BPF_MAP_UPDATE_BATCH:
5399                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5400                 break;
5401         case BPF_MAP_DELETE_BATCH:
5402                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5403                 break;
5404         case BPF_LINK_CREATE:
5405                 err = link_create(&attr, uattr);
5406                 break;
5407         case BPF_LINK_UPDATE:
5408                 err = link_update(&attr);
5409                 break;
5410         case BPF_LINK_GET_FD_BY_ID:
5411                 err = bpf_link_get_fd_by_id(&attr);
5412                 break;
5413         case BPF_LINK_GET_NEXT_ID:
5414                 err = bpf_obj_get_next_id(&attr, uattr.user,
5415                                           &link_idr, &link_idr_lock);
5416                 break;
5417         case BPF_ENABLE_STATS:
5418                 err = bpf_enable_stats(&attr);
5419                 break;
5420         case BPF_ITER_CREATE:
5421                 err = bpf_iter_create(&attr);
5422                 break;
5423         case BPF_LINK_DETACH:
5424                 err = link_detach(&attr);
5425                 break;
5426         case BPF_PROG_BIND_MAP:
5427                 err = bpf_prog_bind_map(&attr);
5428                 break;
5429         default:
5430                 err = -EINVAL;
5431                 break;
5432         }
5433
5434         return err;
5435 }
5436
5437 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5438 {
5439         return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5440 }
5441
5442 static bool syscall_prog_is_valid_access(int off, int size,
5443                                          enum bpf_access_type type,
5444                                          const struct bpf_prog *prog,
5445                                          struct bpf_insn_access_aux *info)
5446 {
5447         if (off < 0 || off >= U16_MAX)
5448                 return false;
5449         if (off % size != 0)
5450                 return false;
5451         return true;
5452 }
5453
5454 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5455 {
5456         switch (cmd) {
5457         case BPF_MAP_CREATE:
5458         case BPF_MAP_DELETE_ELEM:
5459         case BPF_MAP_UPDATE_ELEM:
5460         case BPF_MAP_FREEZE:
5461         case BPF_MAP_GET_FD_BY_ID:
5462         case BPF_PROG_LOAD:
5463         case BPF_BTF_LOAD:
5464         case BPF_LINK_CREATE:
5465         case BPF_RAW_TRACEPOINT_OPEN:
5466                 break;
5467         default:
5468                 return -EINVAL;
5469         }
5470         return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5471 }
5472
5473
5474 /* To shut up -Wmissing-prototypes.
5475  * This function is used by the kernel light skeleton
5476  * to load bpf programs when modules are loaded or during kernel boot.
5477  * See tools/lib/bpf/skel_internal.h
5478  */
5479 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5480
5481 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5482 {
5483         struct bpf_prog * __maybe_unused prog;
5484         struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5485
5486         switch (cmd) {
5487 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5488         case BPF_PROG_TEST_RUN:
5489                 if (attr->test.data_in || attr->test.data_out ||
5490                     attr->test.ctx_out || attr->test.duration ||
5491                     attr->test.repeat || attr->test.flags)
5492                         return -EINVAL;
5493
5494                 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5495                 if (IS_ERR(prog))
5496                         return PTR_ERR(prog);
5497
5498                 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5499                     attr->test.ctx_size_in > U16_MAX) {
5500                         bpf_prog_put(prog);
5501                         return -EINVAL;
5502                 }
5503
5504                 run_ctx.bpf_cookie = 0;
5505                 run_ctx.saved_run_ctx = NULL;
5506                 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5507                         /* recursion detected */
5508                         bpf_prog_put(prog);
5509                         return -EBUSY;
5510                 }
5511                 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5512                 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5513                                                 &run_ctx);
5514                 bpf_prog_put(prog);
5515                 return 0;
5516 #endif
5517         default:
5518                 return ____bpf_sys_bpf(cmd, attr, size);
5519         }
5520 }
5521 EXPORT_SYMBOL(kern_sys_bpf);
5522
5523 static const struct bpf_func_proto bpf_sys_bpf_proto = {
5524         .func           = bpf_sys_bpf,
5525         .gpl_only       = false,
5526         .ret_type       = RET_INTEGER,
5527         .arg1_type      = ARG_ANYTHING,
5528         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
5529         .arg3_type      = ARG_CONST_SIZE,
5530 };
5531
5532 const struct bpf_func_proto * __weak
5533 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5534 {
5535         return bpf_base_func_proto(func_id);
5536 }
5537
/*
 * bpf_sys_close() helper: close file descriptor @fd via close_fd() and
 * return its result.
 */
BPF_CALL_1(bpf_sys_close, u32, fd)
{
	int ret;

	/* No fdget() may be left without its completed fdput() at the point
	 * a bpf program calls this helper.  The only callchain in which the
	 * helper is allowed is:
	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
	 */
	ret = close_fd(fd);

	return ret;
}
5547
5548 static const struct bpf_func_proto bpf_sys_close_proto = {
5549         .func           = bpf_sys_close,
5550         .gpl_only       = false,
5551         .ret_type       = RET_INTEGER,
5552         .arg1_type      = ARG_ANYTHING,
5553 };
5554
5555 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
5556 {
5557         if (flags)
5558                 return -EINVAL;
5559
5560         if (name_sz <= 1 || name[name_sz - 1])
5561                 return -EINVAL;
5562
5563         if (!bpf_dump_raw_ok(current_cred()))
5564                 return -EPERM;
5565
5566         *res = kallsyms_lookup_name(name);
5567         return *res ? 0 : -ENOENT;
5568 }
5569
5570 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5571         .func           = bpf_kallsyms_lookup_name,
5572         .gpl_only       = false,
5573         .ret_type       = RET_INTEGER,
5574         .arg1_type      = ARG_PTR_TO_MEM,
5575         .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
5576         .arg3_type      = ARG_ANYTHING,
5577         .arg4_type      = ARG_PTR_TO_LONG,
5578 };
5579
5580 static const struct bpf_func_proto *
5581 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5582 {
5583         switch (func_id) {
5584         case BPF_FUNC_sys_bpf:
5585                 return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
5586         case BPF_FUNC_btf_find_by_name_kind:
5587                 return &bpf_btf_find_by_name_kind_proto;
5588         case BPF_FUNC_sys_close:
5589                 return &bpf_sys_close_proto;
5590         case BPF_FUNC_kallsyms_lookup_name:
5591                 return &bpf_kallsyms_lookup_name_proto;
5592         default:
5593                 return tracing_prog_func_proto(func_id, prog);
5594         }
5595 }
5596
5597 const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5598         .get_func_proto  = syscall_prog_func_proto,
5599         .is_valid_access = syscall_prog_is_valid_access,
5600 };
5601
5602 const struct bpf_prog_ops bpf_syscall_prog_ops = {
5603         .test_run = bpf_prog_test_run_syscall,
5604 };
5605
5606 #ifdef CONFIG_SYSCTL
5607 static int bpf_stats_handler(struct ctl_table *table, int write,
5608                              void *buffer, size_t *lenp, loff_t *ppos)
5609 {
5610         struct static_key *key = (struct static_key *)table->data;
5611         static int saved_val;
5612         int val, ret;
5613         struct ctl_table tmp = {
5614                 .data   = &val,
5615                 .maxlen = sizeof(val),
5616                 .mode   = table->mode,
5617                 .extra1 = SYSCTL_ZERO,
5618                 .extra2 = SYSCTL_ONE,
5619         };
5620
5621         if (write && !capable(CAP_SYS_ADMIN))
5622                 return -EPERM;
5623
5624         mutex_lock(&bpf_stats_enabled_mutex);
5625         val = saved_val;
5626         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5627         if (write && !ret && val != saved_val) {
5628                 if (val)
5629                         static_key_slow_inc(key);
5630                 else
5631                         static_key_slow_dec(key);
5632                 saved_val = val;
5633         }
5634         mutex_unlock(&bpf_stats_enabled_mutex);
5635         return ret;
5636 }
5637
5638 void __weak unpriv_ebpf_notify(int new_state)
5639 {
5640 }
5641
5642 static int bpf_unpriv_handler(struct ctl_table *table, int write,
5643                               void *buffer, size_t *lenp, loff_t *ppos)
5644 {
5645         int ret, unpriv_enable = *(int *)table->data;
5646         bool locked_state = unpriv_enable == 1;
5647         struct ctl_table tmp = *table;
5648
5649         if (write && !capable(CAP_SYS_ADMIN))
5650                 return -EPERM;
5651
5652         tmp.data = &unpriv_enable;
5653         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5654         if (write && !ret) {
5655                 if (locked_state && unpriv_enable != 1)
5656                         return -EPERM;
5657                 *(int *)table->data = unpriv_enable;
5658         }
5659
5660         if (write)
5661                 unpriv_ebpf_notify(unpriv_enable);
5662
5663         return ret;
5664 }
5665
5666 static struct ctl_table bpf_syscall_table[] = {
5667         {
5668                 .procname       = "unprivileged_bpf_disabled",
5669                 .data           = &sysctl_unprivileged_bpf_disabled,
5670                 .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
5671                 .mode           = 0644,
5672                 .proc_handler   = bpf_unpriv_handler,
5673                 .extra1         = SYSCTL_ZERO,
5674                 .extra2         = SYSCTL_TWO,
5675         },
5676         {
5677                 .procname       = "bpf_stats_enabled",
5678                 .data           = &bpf_stats_enabled_key.key,
5679                 .mode           = 0644,
5680                 .proc_handler   = bpf_stats_handler,
5681         },
5682         { }
5683 };
5684
5685 static int __init bpf_syscall_sysctl_init(void)
5686 {
5687         register_sysctl_init("kernel", bpf_syscall_table);
5688         return 0;
5689 }
5690 late_initcall(bpf_syscall_sysctl_init);
5691 #endif /* CONFIG_SYSCTL */