[linux-2.6-microblaze.git] / kernel / bpf / hashtab.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  * Copyright (c) 2016 Facebook
4  */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/random.h>
11 #include <uapi/linux/btf.h>
12 #include <linux/rcupdate_trace.h>
13 #include "percpu_freelist.h"
14 #include "bpf_lru_list.h"
15 #include "map_in_map.h"
16
17 #define HTAB_CREATE_FLAG_MASK                                           \
18         (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
19          BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
20
21 #define BATCH_OPS(_name)                        \
22         .map_lookup_batch =                     \
23         _name##_map_lookup_batch,               \
24         .map_lookup_and_delete_batch =          \
25         _name##_map_lookup_and_delete_batch,    \
26         .map_update_batch =                     \
27         generic_map_update_batch,               \
28         .map_delete_batch =                     \
29         generic_map_delete_batch
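
/* Editor's illustration (not part of the original source): for a
 * hypothetical map type prefix "foo", BATCH_OPS(foo) expands to
 *
 *     .map_lookup_batch            = foo_map_lookup_batch,
 *     .map_lookup_and_delete_batch = foo_map_lookup_and_delete_batch,
 *     .map_update_batch            = generic_map_update_batch,
 *     .map_delete_batch            = generic_map_delete_batch
 *
 * i.e. only the lookup-style batch operations are hash-table specific,
 * while batched update/delete reuse the generic syscall helpers.
 */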
30
31 /*
32  * The bucket lock has two protection scopes:
33  *
34  * 1) Serializing concurrent operations from BPF programs on different
35  *    CPUs
36  *
37  * 2) Serializing concurrent operations from BPF programs and sys_bpf()
38  *
39  * BPF programs can execute in any context including perf, kprobes and
40  * tracing. As there are almost no limits on where perf, kprobes and
41  * tracing can be invoked from, the lock operations need to be protected
42  * against deadlocks. Deadlocks can be caused by recursion and by an
43  * invocation in the lock held section when functions which acquire this
44  * lock are invoked from sys_bpf(). BPF recursion is prevented by
45  * incrementing the per CPU variable bpf_prog_active, which prevents BPF
46  * programs attached to perf events, kprobes and tracing from being invoked
47  * before the prior invocation from one of these contexts has completed.
48  * sys_bpf() uses the same mechanism by pinning the task to the current
49  * CPU and incrementing the recursion protection across the map operation.
50  *
51  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
52  * operations like memory allocations (even with GFP_ATOMIC) from atomic
53  * contexts. This is required because even with GFP_ATOMIC the memory
54  * allocator calls into code paths which acquire locks with long held lock
55  * sections. To ensure deterministic behaviour, these locks are regular
56  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
57  * true atomic contexts on an RT kernel are the low level hardware
58  * handling, scheduling, low level interrupt handling, NMIs etc. None of
59  * these contexts should ever do memory allocations.
60  *
61  * As regular device interrupt handlers and soft interrupts are forced into
62  * thread context, the existing code which does
63  *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
64  * just works.
65  *
66  * In theory the BPF locks could be converted to regular spinlocks as well,
67  * but the bucket locks and percpu_freelist locks can be taken from
68  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
69  * atomic contexts even on RT. These mechanisms require preallocated maps,
70  * so there is no need to invoke memory allocations within the lock held
71  * sections.
72  *
73  * BPF maps which need dynamic allocation are only used from (forced)
74  * thread context on RT and can therefore use regular spinlocks which in
75  * turn allows invoking memory allocations from the lock held section.
76  *
77  * On a non RT kernel this distinction is neither possible nor required.
78  * spinlock maps to raw_spinlock and the extra code is optimized out by the
79  * compiler.
80  */
81 struct bucket {
82         struct hlist_nulls_head head;
83         union {
84                 raw_spinlock_t raw_lock;
85                 spinlock_t     lock;
86         };
87 };
88
89 #define HASHTAB_MAP_LOCK_COUNT 8
90 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
91
92 struct bpf_htab {
93         struct bpf_map map;
94         struct bucket *buckets;
95         void *elems;
96         union {
97                 struct pcpu_freelist freelist;
98                 struct bpf_lru lru;
99         };
100         struct htab_elem *__percpu *extra_elems;
101         atomic_t count; /* number of elements in this hashtable */
102         u32 n_buckets;  /* number of hash buckets */
103         u32 elem_size;  /* size of each element in bytes */
104         u32 hashrnd;
105         struct lock_class_key lockdep_key;
106         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
107 };
108
109 /* each htab element is struct htab_elem + key + value */
110 struct htab_elem {
111         union {
112                 struct hlist_nulls_node hash_node;
113                 struct {
114                         void *padding;
115                         union {
116                                 struct bpf_htab *htab;
117                                 struct pcpu_freelist_node fnode;
118                                 struct htab_elem *batch_flink;
119                         };
120                 };
121         };
122         union {
123                 struct rcu_head rcu;
124                 struct bpf_lru_node lru_node;
125         };
126         u32 hash;
127         char key[] __aligned(8);
128 };
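
/* Editor's sketch of the element layout (an assumption derived from the
 * elem_size computation in htab_map_alloc() and from htab_elem_get_ptr() /
 * htab_elem_set_ptr() below):
 *
 *     struct htab_elem hdr;                        // hash_node/fnode, rcu, hash
 *     char key[round_up(key_size, 8)];             // key, padded to 8 bytes
 *     union {
 *             char value[round_up(value_size, 8)]; // regular hash maps
 *             void __percpu *pptr;                 // per-cpu hash maps
 *     };
 *
 * which is why value accesses throughout this file use
 * l->key + round_up(key_size, 8).
 */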
129
130 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
131 {
132         return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
133 }
134
135 static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
136 {
137         return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
138 }
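
/* Editor's summary of the lock selection above:
 *
 *     !CONFIG_PREEMPT_RT             -> raw_spinlock_t (always)
 *     PREEMPT_RT + preallocated      -> raw_spinlock_t (no allocation is
 *                                       done inside the lock held section)
 *     PREEMPT_RT + BPF_F_NO_PREALLOC -> spinlock_t ('sleepable' on RT, so
 *                                       GFP_ATOMIC allocations under it
 *                                       are fine)
 */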
139
140 static void htab_init_buckets(struct bpf_htab *htab)
141 {
142         unsigned i;
143
144         for (i = 0; i < htab->n_buckets; i++) {
145                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
146                 if (htab_use_raw_lock(htab)) {
147                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
148                         lockdep_set_class(&htab->buckets[i].raw_lock,
149                                           &htab->lockdep_key);
150                 } else {
151                         spin_lock_init(&htab->buckets[i].lock);
152                         lockdep_set_class(&htab->buckets[i].lock,
153                                           &htab->lockdep_key);
154                 }
155                 cond_resched();
156         }
157 }
158
159 static inline int htab_lock_bucket(const struct bpf_htab *htab,
160                                    struct bucket *b, u32 hash,
161                                    unsigned long *pflags)
162 {
163         unsigned long flags;
164
165         hash = hash & HASHTAB_MAP_LOCK_MASK;
166
167         migrate_disable();
168         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
169                 __this_cpu_dec(*(htab->map_locked[hash]));
170                 migrate_enable();
171                 return -EBUSY;
172         }
173
174         if (htab_use_raw_lock(htab))
175                 raw_spin_lock_irqsave(&b->raw_lock, flags);
176         else
177                 spin_lock_irqsave(&b->lock, flags);
178         *pflags = flags;
179
180         return 0;
181 }
182
183 static inline void htab_unlock_bucket(const struct bpf_htab *htab,
184                                       struct bucket *b, u32 hash,
185                                       unsigned long flags)
186 {
187         hash = hash & HASHTAB_MAP_LOCK_MASK;
188         if (htab_use_raw_lock(htab))
189                 raw_spin_unlock_irqrestore(&b->raw_lock, flags);
190         else
191                 spin_unlock_irqrestore(&b->lock, flags);
192         __this_cpu_dec(*(htab->map_locked[hash]));
193         migrate_enable();
194 }
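
/* Typical use of the two helpers above, as seen e.g. in
 * htab_map_delete_elem() further down (editor's sketch, not new code):
 *
 *     unsigned long flags;
 *     int ret;
 *
 *     ret = htab_lock_bucket(htab, b, hash, &flags);
 *     if (ret)                // -EBUSY: we re-entered on this CPU for this
 *             return ret;     // bucket's lock subset, so bail out instead
 *                             // of deadlocking
 *     ... modify the bucket's hlist_nulls chain ...
 *     htab_unlock_bucket(htab, b, hash, flags);
 */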
195
196 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
197
198 static bool htab_is_lru(const struct bpf_htab *htab)
199 {
200         return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
201                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
202 }
203
204 static bool htab_is_percpu(const struct bpf_htab *htab)
205 {
206         return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
207                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
208 }
209
210 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
211                                      void __percpu *pptr)
212 {
213         *(void __percpu **)(l->key + key_size) = pptr;
214 }
215
216 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
217 {
218         return *(void __percpu **)(l->key + key_size);
219 }
220
221 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
222 {
223         return *(void **)(l->key + roundup(map->key_size, 8));
224 }
225
226 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
227 {
228         return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
229 }
230
231 static bool htab_has_extra_elems(struct bpf_htab *htab)
232 {
233         return !htab_is_percpu(htab) && !htab_is_lru(htab);
234 }
235
236 static void htab_free_prealloced_timers(struct bpf_htab *htab)
237 {
238         u32 num_entries = htab->map.max_entries;
239         int i;
240
241         if (likely(!map_value_has_timer(&htab->map)))
242                 return;
243         if (htab_has_extra_elems(htab))
244                 num_entries += num_possible_cpus();
245
246         for (i = 0; i < num_entries; i++) {
247                 struct htab_elem *elem;
248
249                 elem = get_htab_elem(htab, i);
250                 bpf_timer_cancel_and_free(elem->key +
251                                           round_up(htab->map.key_size, 8) +
252                                           htab->map.timer_off);
253                 cond_resched();
254         }
255 }
256
257 static void htab_free_elems(struct bpf_htab *htab)
258 {
259         int i;
260
261         if (!htab_is_percpu(htab))
262                 goto free_elems;
263
264         for (i = 0; i < htab->map.max_entries; i++) {
265                 void __percpu *pptr;
266
267                 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
268                                          htab->map.key_size);
269                 free_percpu(pptr);
270                 cond_resched();
271         }
272 free_elems:
273         bpf_map_area_free(htab->elems);
274 }
275
276 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
277  * (bucket_lock). If both locks need to be acquired together, the lock
278  * order is always lru_lock -> bucket_lock and this only happens in
279  * bpf_lru_list.c logic. For example, certain code paths of
280  * bpf_lru_pop_free(), which is called by prealloc_lru_pop(),
281  * will acquire lru_lock first, followed by bucket_lock.
282  *
283  * In hashtab.c, to avoid deadlock, lock acquisition of
284  * bucket_lock followed by lru_lock is not allowed. In such cases,
285  * bucket_lock needs to be released first before acquiring lru_lock.
286  */
287 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
288                                           u32 hash)
289 {
290         struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
291         struct htab_elem *l;
292
293         if (node) {
294                 u32 key_size = htab->map.key_size;
295
296                 l = container_of(node, struct htab_elem, lru_node);
297                 memcpy(l->key, key, key_size);
298                 check_and_init_map_value(&htab->map,
299                                          l->key + round_up(key_size, 8));
300                 return l;
301         }
302
303         return NULL;
304 }
305
306 static int prealloc_init(struct bpf_htab *htab)
307 {
308         u32 num_entries = htab->map.max_entries;
309         int err = -ENOMEM, i;
310
311         if (htab_has_extra_elems(htab))
312                 num_entries += num_possible_cpus();
313
314         htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
315                                          htab->map.numa_node);
316         if (!htab->elems)
317                 return -ENOMEM;
318
319         if (!htab_is_percpu(htab))
320                 goto skip_percpu_elems;
321
322         for (i = 0; i < num_entries; i++) {
323                 u32 size = round_up(htab->map.value_size, 8);
324                 void __percpu *pptr;
325
326                 pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
327                                             GFP_USER | __GFP_NOWARN);
328                 if (!pptr)
329                         goto free_elems;
330                 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
331                                   pptr);
332                 cond_resched();
333         }
334
335 skip_percpu_elems:
336         if (htab_is_lru(htab))
337                 err = bpf_lru_init(&htab->lru,
338                                    htab->map.map_flags & BPF_F_NO_COMMON_LRU,
339                                    offsetof(struct htab_elem, hash) -
340                                    offsetof(struct htab_elem, lru_node),
341                                    htab_lru_map_delete_node,
342                                    htab);
343         else
344                 err = pcpu_freelist_init(&htab->freelist);
345
346         if (err)
347                 goto free_elems;
348
349         if (htab_is_lru(htab))
350                 bpf_lru_populate(&htab->lru, htab->elems,
351                                  offsetof(struct htab_elem, lru_node),
352                                  htab->elem_size, num_entries);
353         else
354                 pcpu_freelist_populate(&htab->freelist,
355                                        htab->elems + offsetof(struct htab_elem, fnode),
356                                        htab->elem_size, num_entries);
357
358         return 0;
359
360 free_elems:
361         htab_free_elems(htab);
362         return err;
363 }
364
365 static void prealloc_destroy(struct bpf_htab *htab)
366 {
367         htab_free_elems(htab);
368
369         if (htab_is_lru(htab))
370                 bpf_lru_destroy(&htab->lru);
371         else
372                 pcpu_freelist_destroy(&htab->freelist);
373 }
374
375 static int alloc_extra_elems(struct bpf_htab *htab)
376 {
377         struct htab_elem *__percpu *pptr, *l_new;
378         struct pcpu_freelist_node *l;
379         int cpu;
380
381         pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
382                                     GFP_USER | __GFP_NOWARN);
383         if (!pptr)
384                 return -ENOMEM;
385
386         for_each_possible_cpu(cpu) {
387                 l = pcpu_freelist_pop(&htab->freelist);
388                 /* pop will succeed, since prealloc_init()
389                  * preallocated extra num_possible_cpus elements
390                  */
391                 l_new = container_of(l, struct htab_elem, fnode);
392                 *per_cpu_ptr(pptr, cpu) = l_new;
393         }
394         htab->extra_elems = pptr;
395         return 0;
396 }
397
398 /* Called from syscall */
399 static int htab_map_alloc_check(union bpf_attr *attr)
400 {
401         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
402                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
403         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
404                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
405         /* percpu_lru means each cpu has its own LRU list.
406          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
407          * the map's value itself is percpu.  percpu_lru has
408          * nothing to do with the map's value.
409          */
410         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
411         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
412         bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
413         int numa_node = bpf_map_attr_numa_node(attr);
414
415         BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
416                      offsetof(struct htab_elem, hash_node.pprev));
417         BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
418                      offsetof(struct htab_elem, hash_node.pprev));
419
420         if (lru && !bpf_capable())
421                 /* The LRU implementation is much more complicated than
422                  * other maps.  Hence, limit it to CAP_BPF.
423                  */
424                 return -EPERM;
425
426         if (zero_seed && !capable(CAP_SYS_ADMIN))
427                 /* Guard against local DoS, and discourage production use. */
428                 return -EPERM;
429
430         if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
431             !bpf_map_flags_access_ok(attr->map_flags))
432                 return -EINVAL;
433
434         if (!lru && percpu_lru)
435                 return -EINVAL;
436
437         if (lru && !prealloc)
438                 return -ENOTSUPP;
439
440         if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
441                 return -EINVAL;
442
443         /* check sanity of attributes.
444          * value_size == 0 may be allowed in the future to use map as a set
445          */
446         if (attr->max_entries == 0 || attr->key_size == 0 ||
447             attr->value_size == 0)
448                 return -EINVAL;
449
450         if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
451            sizeof(struct htab_elem))
452                 /* if key_size + value_size is bigger than this, user space won't be
453                  * able to access the elements via bpf syscall. This check
454                  * also makes sure that the elem_size doesn't overflow and it's
455                  * kmalloc-able later in htab_map_update_elem()
456                  */
457                 return -E2BIG;
458
459         return 0;
460 }
461
462 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
463 {
464         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
465                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
466         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
467                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
468         /* percpu_lru means each cpu has its own LRU list.
469          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
470          * the map's value itself is percpu.  percpu_lru has
471          * nothing to do with the map's value.
472          */
473         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
474         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
475         struct bpf_htab *htab;
476         int err, i;
477
478         htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
479         if (!htab)
480                 return ERR_PTR(-ENOMEM);
481
482         lockdep_register_key(&htab->lockdep_key);
483
484         bpf_map_init_from_attr(&htab->map, attr);
485
486         if (percpu_lru) {
487                 /* ensure each CPU's lru list has >= 1 element.
488                  * while we are at it, make each lru list have the same
489                  * number of elements.
490                  */
491                 htab->map.max_entries = roundup(attr->max_entries,
492                                                 num_possible_cpus());
493                 if (htab->map.max_entries < attr->max_entries)
494                         htab->map.max_entries = rounddown(attr->max_entries,
495                                                           num_possible_cpus());
496         }
497
498         /* hash table size must be a power of 2 */
499         htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
500
501         htab->elem_size = sizeof(struct htab_elem) +
502                           round_up(htab->map.key_size, 8);
503         if (percpu)
504                 htab->elem_size += sizeof(void *);
505         else
506                 htab->elem_size += round_up(htab->map.value_size, 8);
507
508         err = -E2BIG;
509         /* prevent zero size kmalloc and check for u32 overflow */
510         if (htab->n_buckets == 0 ||
511             htab->n_buckets > U32_MAX / sizeof(struct bucket))
512                 goto free_htab;
513
514         err = -ENOMEM;
515         htab->buckets = bpf_map_area_alloc(htab->n_buckets *
516                                            sizeof(struct bucket),
517                                            htab->map.numa_node);
518         if (!htab->buckets)
519                 goto free_htab;
520
521         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
522                 htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
523                                                            sizeof(int),
524                                                            sizeof(int),
525                                                            GFP_USER);
526                 if (!htab->map_locked[i])
527                         goto free_map_locked;
528         }
529
530         if (htab->map.map_flags & BPF_F_ZERO_SEED)
531                 htab->hashrnd = 0;
532         else
533                 htab->hashrnd = get_random_int();
534
535         htab_init_buckets(htab);
536
537         if (prealloc) {
538                 err = prealloc_init(htab);
539                 if (err)
540                         goto free_map_locked;
541
542                 if (!percpu && !lru) {
543                         /* lru itself can remove the least used element, so
544                          * there is no need for an extra elem during map_update.
545                          */
546                         err = alloc_extra_elems(htab);
547                         if (err)
548                                 goto free_prealloc;
549                 }
550         }
551
552         return &htab->map;
553
554 free_prealloc:
555         prealloc_destroy(htab);
556 free_map_locked:
557         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
558                 free_percpu(htab->map_locked[i]);
559         bpf_map_area_free(htab->buckets);
560 free_htab:
561         lockdep_unregister_key(&htab->lockdep_key);
562         kfree(htab);
563         return ERR_PTR(err);
564 }
565
566 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
567 {
568         return jhash(key, key_len, hashrnd);
569 }
570
571 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
572 {
573         return &htab->buckets[hash & (htab->n_buckets - 1)];
574 }
575
576 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
577 {
578         return &__select_bucket(htab, hash)->head;
579 }
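
/* Editor's note: n_buckets is always a power of two (it comes from
 * roundup_pow_of_two() in htab_map_alloc()), so "hash & (n_buckets - 1)"
 * is the usual cheap replacement for "hash % n_buckets". For example,
 * with n_buckets == 8, a hash of 13 (0b1101) selects bucket 5 (0b101).
 */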
580
581 /* this lookup function can only be called with bucket lock taken */
582 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
583                                          void *key, u32 key_size)
584 {
585         struct hlist_nulls_node *n;
586         struct htab_elem *l;
587
588         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
589                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
590                         return l;
591
592         return NULL;
593 }
594
595 /* can be called without the bucket lock. it will repeat the loop in
596  * the unlikely event that elements moved from one bucket into another
597  * while the linked list is being walked
598  */
599 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
600                                                u32 hash, void *key,
601                                                u32 key_size, u32 n_buckets)
602 {
603         struct hlist_nulls_node *n;
604         struct htab_elem *l;
605
606 again:
607         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
608                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
609                         return l;
610
611         if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
612                 goto again;
613
614         return NULL;
615 }
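
/* Editor's note on the get_nulls_value() check above: with preallocated
 * maps an element can be freed and immediately reused for a different key,
 * which may move it to another bucket while a lockless walker is still
 * traversing it; the walker would then silently continue on the wrong
 * chain. Every chain ends in a nulls marker that encodes its bucket index
 * (see htab_init_buckets()), so finishing on a marker that does not match
 * "hash & (n_buckets - 1)" means the walk strayed and must be restarted.
 */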
616
617 /* Called from syscall or from eBPF program directly, so
618  * arguments have to match bpf_map_lookup_elem() exactly.
619  * The return value is adjusted by BPF instructions
620  * in htab_map_gen_lookup().
621  */
622 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
623 {
624         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
625         struct hlist_nulls_head *head;
626         struct htab_elem *l;
627         u32 hash, key_size;
628
629         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
630                      !rcu_read_lock_bh_held());
631
632         key_size = map->key_size;
633
634         hash = htab_map_hash(key, key_size, htab->hashrnd);
635
636         head = select_bucket(htab, hash);
637
638         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
639
640         return l;
641 }
642
643 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
644 {
645         struct htab_elem *l = __htab_map_lookup_elem(map, key);
646
647         if (l)
648                 return l->key + round_up(map->key_size, 8);
649
650         return NULL;
651 }
652
653 /* inline bpf_map_lookup_elem() call.
654  * Instead of:
655  * bpf_prog
656  *   bpf_map_lookup_elem
657  *     map->ops->map_lookup_elem
658  *       htab_map_lookup_elem
659  *         __htab_map_lookup_elem
660  * do:
661  * bpf_prog
662  *   __htab_map_lookup_elem
663  */
664 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
665 {
666         struct bpf_insn *insn = insn_buf;
667         const int ret = BPF_REG_0;
668
669         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
670                      (void *(*)(struct bpf_map *map, void *key))NULL));
671         *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
672         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
673         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
674                                 offsetof(struct htab_elem, key) +
675                                 round_up(map->key_size, 8));
676         return insn - insn_buf;
677 }
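
/* Editor's sketch: the instruction sequence emitted above is roughly
 * equivalent to the following C, inlined into the calling program:
 *
 *     void *l = __htab_map_lookup_elem(map, key);
 *
 *     if (l)
 *             l += offsetof(struct htab_elem, key) +
 *                  round_up(map->key_size, 8);    // advance to the value
 *     return l;
 */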
678
679 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
680                                                         void *key, const bool mark)
681 {
682         struct htab_elem *l = __htab_map_lookup_elem(map, key);
683
684         if (l) {
685                 if (mark)
686                         bpf_lru_node_set_ref(&l->lru_node);
687                 return l->key + round_up(map->key_size, 8);
688         }
689
690         return NULL;
691 }
692
693 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
694 {
695         return __htab_lru_map_lookup_elem(map, key, true);
696 }
697
698 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
699 {
700         return __htab_lru_map_lookup_elem(map, key, false);
701 }
702
703 static int htab_lru_map_gen_lookup(struct bpf_map *map,
704                                    struct bpf_insn *insn_buf)
705 {
706         struct bpf_insn *insn = insn_buf;
707         const int ret = BPF_REG_0;
708         const int ref_reg = BPF_REG_1;
709
710         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
711                      (void *(*)(struct bpf_map *map, void *key))NULL));
712         *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
713         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
714         *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
715                               offsetof(struct htab_elem, lru_node) +
716                               offsetof(struct bpf_lru_node, ref));
717         *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
718         *insn++ = BPF_ST_MEM(BPF_B, ret,
719                              offsetof(struct htab_elem, lru_node) +
720                              offsetof(struct bpf_lru_node, ref),
721                              1);
722         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
723                                 offsetof(struct htab_elem, key) +
724                                 round_up(map->key_size, 8));
725         return insn - insn_buf;
726 }
727
728 static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
729 {
730         if (unlikely(map_value_has_timer(&htab->map)))
731                 bpf_timer_cancel_and_free(elem->key +
732                                           round_up(htab->map.key_size, 8) +
733                                           htab->map.timer_off);
734 }
735
736 /* It is called from the bpf_lru_list when the LRU needs to delete
737  * older elements from the htab.
738  */
739 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
740 {
741         struct bpf_htab *htab = (struct bpf_htab *)arg;
742         struct htab_elem *l = NULL, *tgt_l;
743         struct hlist_nulls_head *head;
744         struct hlist_nulls_node *n;
745         unsigned long flags;
746         struct bucket *b;
747         int ret;
748
749         tgt_l = container_of(node, struct htab_elem, lru_node);
750         b = __select_bucket(htab, tgt_l->hash);
751         head = &b->head;
752
753         ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
754         if (ret)
755                 return false;
756
757         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
758                 if (l == tgt_l) {
759                         hlist_nulls_del_rcu(&l->hash_node);
760                         check_and_free_timer(htab, l);
761                         break;
762                 }
763
764         htab_unlock_bucket(htab, b, tgt_l->hash, flags);
765
766         return l == tgt_l;
767 }
768
769 /* Called from syscall */
770 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
771 {
772         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
773         struct hlist_nulls_head *head;
774         struct htab_elem *l, *next_l;
775         u32 hash, key_size;
776         int i = 0;
777
778         WARN_ON_ONCE(!rcu_read_lock_held());
779
780         key_size = map->key_size;
781
782         if (!key)
783                 goto find_first_elem;
784
785         hash = htab_map_hash(key, key_size, htab->hashrnd);
786
787         head = select_bucket(htab, hash);
788
789         /* lookup the key */
790         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
791
792         if (!l)
793                 goto find_first_elem;
794
795         /* key was found, get next key in the same bucket */
796         next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
797                                   struct htab_elem, hash_node);
798
799         if (next_l) {
800                 /* if next elem in this hash list is non-NULL, just return it */
801                 memcpy(next_key, next_l->key, key_size);
802                 return 0;
803         }
804
805         /* no more elements in this hash list, go to the next bucket */
806         i = hash & (htab->n_buckets - 1);
807         i++;
808
809 find_first_elem:
810         /* iterate over buckets */
811         for (; i < htab->n_buckets; i++) {
812                 head = select_bucket(htab, i);
813
814                 /* pick first element in the bucket */
815                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
816                                           struct htab_elem, hash_node);
817                 if (next_l) {
818                         /* if it's not empty, just return it */
819                         memcpy(next_key, next_l->key, key_size);
820                         return 0;
821                 }
822         }
823
824         /* iterated over all buckets and all elements */
825         return -ENOENT;
826 }
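
/* Editor's sketch of how the above is typically driven from user space
 * (an assumption: a map with 4-byte keys, walked via libbpf's
 * bpf_map_get_next_key() wrapper around the bpf() syscall):
 *
 *     __u32 cur, next;
 *     void *prev = NULL;                  // NULL key -> return first element
 *
 *     while (bpf_map_get_next_key(map_fd, prev, &next) == 0) {
 *             cur = next;                 // look up / process "cur" here
 *             prev = &cur;
 *     }
 *     // the syscall reports ENOENT once every bucket has been walked
 */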
827
828 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
829 {
830         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
831                 free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
832         check_and_free_timer(htab, l);
833         kfree(l);
834 }
835
836 static void htab_elem_free_rcu(struct rcu_head *head)
837 {
838         struct htab_elem *l = container_of(head, struct htab_elem, rcu);
839         struct bpf_htab *htab = l->htab;
840
841         htab_elem_free(htab, l);
842 }
843
844 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
845 {
846         struct bpf_map *map = &htab->map;
847         void *ptr;
848
849         if (map->ops->map_fd_put_ptr) {
850                 ptr = fd_htab_map_get_ptr(map, l);
851                 map->ops->map_fd_put_ptr(ptr);
852         }
853 }
854
855 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
856 {
857         htab_put_fd_value(htab, l);
858
859         if (htab_is_prealloc(htab)) {
860                 check_and_free_timer(htab, l);
861                 __pcpu_freelist_push(&htab->freelist, &l->fnode);
862         } else {
863                 atomic_dec(&htab->count);
864                 l->htab = htab;
865                 call_rcu(&l->rcu, htab_elem_free_rcu);
866         }
867 }
868
869 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
870                             void *value, bool onallcpus)
871 {
872         if (!onallcpus) {
873                 /* copy true value_size bytes */
874                 memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
875         } else {
876                 u32 size = round_up(htab->map.value_size, 8);
877                 int off = 0, cpu;
878
879                 for_each_possible_cpu(cpu) {
880                         bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
881                                         value + off, size);
882                         off += size;
883                 }
884         }
885 }
886
887 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
888                             void *value, bool onallcpus)
889 {
890         /* When using prealloc and not setting the initial value on all cpus,
891          * zero-fill element values for other cpus (just as happens when
892          * not using prealloc). Otherwise, the bpf program has no way to ensure
893          * known initial values for cpus other than the current one
894          * (onallcpus=false always when coming from bpf prog).
895          */
896         if (htab_is_prealloc(htab) && !onallcpus) {
897                 u32 size = round_up(htab->map.value_size, 8);
898                 int current_cpu = raw_smp_processor_id();
899                 int cpu;
900
901                 for_each_possible_cpu(cpu) {
902                         if (cpu == current_cpu)
903                                 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
904                                                 size);
905                         else
906                                 memset(per_cpu_ptr(pptr, cpu), 0, size);
907                 }
908         } else {
909                 pcpu_copy_value(htab, pptr, value, onallcpus);
910         }
911 }
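
/* Editor's note: when onallcpus is true (an update coming from the bpf()
 * syscall on a per-cpu map), "value" is expected to be one flat buffer with
 * a slot per possible CPU, each slot padded to 8 bytes, roughly:
 *
 *     slot        = round_up(map->value_size, 8);
 *     buffer size = num_possible_cpus() * slot;
 *     // the n-th possible CPU's value starts at value + n * slot
 *
 * which matches the "off += size" walk in pcpu_copy_value() above.
 */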
912
913 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
914 {
915         return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
916                BITS_PER_LONG == 64;
917 }
918
919 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
920                                          void *value, u32 key_size, u32 hash,
921                                          bool percpu, bool onallcpus,
922                                          struct htab_elem *old_elem)
923 {
924         u32 size = htab->map.value_size;
925         bool prealloc = htab_is_prealloc(htab);
926         struct htab_elem *l_new, **pl_new;
927         void __percpu *pptr;
928
929         if (prealloc) {
930                 if (old_elem) {
931                         /* if we're updating the existing element,
932                          * use per-cpu extra elems to avoid freelist_pop/push
933                          */
934                         pl_new = this_cpu_ptr(htab->extra_elems);
935                         l_new = *pl_new;
936                         htab_put_fd_value(htab, old_elem);
937                         *pl_new = old_elem;
938                 } else {
939                         struct pcpu_freelist_node *l;
940
941                         l = __pcpu_freelist_pop(&htab->freelist);
942                         if (!l)
943                                 return ERR_PTR(-E2BIG);
944                         l_new = container_of(l, struct htab_elem, fnode);
945                 }
946         } else {
947                 if (atomic_inc_return(&htab->count) > htab->map.max_entries)
948                         if (!old_elem) {
949                                 /* when map is full and update() is replacing
950                                  * old element, it's ok to allocate, since
951                                  * old element will be freed immediately.
952                                  * Otherwise return an error
953                                  */
954                                 l_new = ERR_PTR(-E2BIG);
955                                 goto dec_count;
956                         }
957                 l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
958                                              GFP_ATOMIC | __GFP_NOWARN,
959                                              htab->map.numa_node);
960                 if (!l_new) {
961                         l_new = ERR_PTR(-ENOMEM);
962                         goto dec_count;
963                 }
964                 check_and_init_map_value(&htab->map,
965                                          l_new->key + round_up(key_size, 8));
966         }
967
968         memcpy(l_new->key, key, key_size);
969         if (percpu) {
970                 size = round_up(size, 8);
971                 if (prealloc) {
972                         pptr = htab_elem_get_ptr(l_new, key_size);
973                 } else {
974                         /* alloc_percpu zero-fills */
975                         pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
976                                                     GFP_ATOMIC | __GFP_NOWARN);
977                         if (!pptr) {
978                                 kfree(l_new);
979                                 l_new = ERR_PTR(-ENOMEM);
980                                 goto dec_count;
981                         }
982                 }
983
984                 pcpu_init_value(htab, pptr, value, onallcpus);
985
986                 if (!prealloc)
987                         htab_elem_set_ptr(l_new, key_size, pptr);
988         } else if (fd_htab_map_needs_adjust(htab)) {
989                 size = round_up(size, 8);
990                 memcpy(l_new->key + round_up(key_size, 8), value, size);
991         } else {
992                 copy_map_value(&htab->map,
993                                l_new->key + round_up(key_size, 8),
994                                value);
995         }
996
997         l_new->hash = hash;
998         return l_new;
999 dec_count:
1000         atomic_dec(&htab->count);
1001         return l_new;
1002 }
1003
1004 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1005                        u64 map_flags)
1006 {
1007         if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1008                 /* elem already exists */
1009                 return -EEXIST;
1010
1011         if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1012                 /* elem doesn't exist, cannot update it */
1013                 return -ENOENT;
1014
1015         return 0;
1016 }
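
/* Editor's sketch of the resulting bpf_map_update_elem() semantics for the
 * standard flag values handled above:
 *
 *     BPF_ANY      - create the element, or update it if it already exists
 *     BPF_NOEXIST  - create only; fails with -EEXIST if the key is present
 *     BPF_EXIST    - update only; fails with -ENOENT if the key is absent
 *     BPF_F_LOCK   - may be OR'ed in by callers that support it; it is
 *                    masked off here and handled separately (spin-locked
 *                    copy of the value)
 */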
1017
1018 /* Called from syscall or from eBPF program */
1019 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1020                                 u64 map_flags)
1021 {
1022         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1023         struct htab_elem *l_new = NULL, *l_old;
1024         struct hlist_nulls_head *head;
1025         unsigned long flags;
1026         struct bucket *b;
1027         u32 key_size, hash;
1028         int ret;
1029
1030         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1031                 /* unknown flags */
1032                 return -EINVAL;
1033
1034         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1035                      !rcu_read_lock_bh_held());
1036
1037         key_size = map->key_size;
1038
1039         hash = htab_map_hash(key, key_size, htab->hashrnd);
1040
1041         b = __select_bucket(htab, hash);
1042         head = &b->head;
1043
1044         if (unlikely(map_flags & BPF_F_LOCK)) {
1045                 if (unlikely(!map_value_has_spin_lock(map)))
1046                         return -EINVAL;
1047                 /* find an element without taking the bucket lock */
1048                 l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1049                                               htab->n_buckets);
1050                 ret = check_flags(htab, l_old, map_flags);
1051                 if (ret)
1052                         return ret;
1053                 if (l_old) {
1054                         /* grab the element lock and update value in place */
1055                         copy_map_value_locked(map,
1056                                               l_old->key + round_up(key_size, 8),
1057                                               value, false);
1058                         return 0;
1059                 }
1060                 /* fall through, grab the bucket lock and lookup again.
1061                  * 99.9% chance that the element won't be found,
1062                  * but second lookup under lock has to be done.
1063                  */
1064         }
1065
1066         ret = htab_lock_bucket(htab, b, hash, &flags);
1067         if (ret)
1068                 return ret;
1069
1070         l_old = lookup_elem_raw(head, hash, key, key_size);
1071
1072         ret = check_flags(htab, l_old, map_flags);
1073         if (ret)
1074                 goto err;
1075
1076         if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1077                 /* first lookup without the bucket lock didn't find the element,
1078                  * but second lookup with the bucket lock found it.
1079                  * This case is highly unlikely, but has to be dealt with:
1080                  * grab the element lock in addition to the bucket lock
1081                  * and update element in place
1082                  */
1083                 copy_map_value_locked(map,
1084                                       l_old->key + round_up(key_size, 8),
1085                                       value, false);
1086                 ret = 0;
1087                 goto err;
1088         }
1089
1090         l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1091                                 l_old);
1092         if (IS_ERR(l_new)) {
1093                 /* all pre-allocated elements are in use or memory exhausted */
1094                 ret = PTR_ERR(l_new);
1095                 goto err;
1096         }
1097
1098         /* add new element to the head of the list, so that
1099          * concurrent search will find it before old elem
1100          */
1101         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1102         if (l_old) {
1103                 hlist_nulls_del_rcu(&l_old->hash_node);
1104                 if (!htab_is_prealloc(htab))
1105                         free_htab_elem(htab, l_old);
1106                 else
1107                         check_and_free_timer(htab, l_old);
1108         }
1109         ret = 0;
1110 err:
1111         htab_unlock_bucket(htab, b, hash, flags);
1112         return ret;
1113 }
1114
1115 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1116 {
1117         check_and_free_timer(htab, elem);
1118         bpf_lru_push_free(&htab->lru, &elem->lru_node);
1119 }
1120
1121 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1122                                     u64 map_flags)
1123 {
1124         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1125         struct htab_elem *l_new, *l_old = NULL;
1126         struct hlist_nulls_head *head;
1127         unsigned long flags;
1128         struct bucket *b;
1129         u32 key_size, hash;
1130         int ret;
1131
1132         if (unlikely(map_flags > BPF_EXIST))
1133                 /* unknown flags */
1134                 return -EINVAL;
1135
1136         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1137                      !rcu_read_lock_bh_held());
1138
1139         key_size = map->key_size;
1140
1141         hash = htab_map_hash(key, key_size, htab->hashrnd);
1142
1143         b = __select_bucket(htab, hash);
1144         head = &b->head;
1145
1146         /* For LRU, we need to alloc before taking bucket's
1147          * spinlock because getting free nodes from LRU may need
1148          * to remove older elements from htab and this removal
1149          * operation will need a bucket lock.
1150          */
1151         l_new = prealloc_lru_pop(htab, key, hash);
1152         if (!l_new)
1153                 return -ENOMEM;
1154         copy_map_value(&htab->map,
1155                        l_new->key + round_up(map->key_size, 8), value);
1156
1157         ret = htab_lock_bucket(htab, b, hash, &flags);
1158         if (ret)
1159                 return ret;
1160
1161         l_old = lookup_elem_raw(head, hash, key, key_size);
1162
1163         ret = check_flags(htab, l_old, map_flags);
1164         if (ret)
1165                 goto err;
1166
1167         /* add new element to the head of the list, so that
1168          * concurrent search will find it before old elem
1169          */
1170         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1171         if (l_old) {
1172                 bpf_lru_node_set_ref(&l_new->lru_node);
1173                 hlist_nulls_del_rcu(&l_old->hash_node);
1174         }
1175         ret = 0;
1176
1177 err:
1178         htab_unlock_bucket(htab, b, hash, flags);
1179
1180         if (ret)
1181                 htab_lru_push_free(htab, l_new);
1182         else if (l_old)
1183                 htab_lru_push_free(htab, l_old);
1184
1185         return ret;
1186 }
1187
1188 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1189                                          void *value, u64 map_flags,
1190                                          bool onallcpus)
1191 {
1192         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1193         struct htab_elem *l_new = NULL, *l_old;
1194         struct hlist_nulls_head *head;
1195         unsigned long flags;
1196         struct bucket *b;
1197         u32 key_size, hash;
1198         int ret;
1199
1200         if (unlikely(map_flags > BPF_EXIST))
1201                 /* unknown flags */
1202                 return -EINVAL;
1203
1204         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1205                      !rcu_read_lock_bh_held());
1206
1207         key_size = map->key_size;
1208
1209         hash = htab_map_hash(key, key_size, htab->hashrnd);
1210
1211         b = __select_bucket(htab, hash);
1212         head = &b->head;
1213
1214         ret = htab_lock_bucket(htab, b, hash, &flags);
1215         if (ret)
1216                 return ret;
1217
1218         l_old = lookup_elem_raw(head, hash, key, key_size);
1219
1220         ret = check_flags(htab, l_old, map_flags);
1221         if (ret)
1222                 goto err;
1223
1224         if (l_old) {
1225                 /* per-cpu hash map can update value in-place */
1226                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1227                                 value, onallcpus);
1228         } else {
1229                 l_new = alloc_htab_elem(htab, key, value, key_size,
1230                                         hash, true, onallcpus, NULL);
1231                 if (IS_ERR(l_new)) {
1232                         ret = PTR_ERR(l_new);
1233                         goto err;
1234                 }
1235                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1236         }
1237         ret = 0;
1238 err:
1239         htab_unlock_bucket(htab, b, hash, flags);
1240         return ret;
1241 }
1242
1243 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1244                                              void *value, u64 map_flags,
1245                                              bool onallcpus)
1246 {
1247         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1248         struct htab_elem *l_new = NULL, *l_old;
1249         struct hlist_nulls_head *head;
1250         unsigned long flags;
1251         struct bucket *b;
1252         u32 key_size, hash;
1253         int ret;
1254
1255         if (unlikely(map_flags > BPF_EXIST))
1256                 /* unknown flags */
1257                 return -EINVAL;
1258
1259         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1260                      !rcu_read_lock_bh_held());
1261
1262         key_size = map->key_size;
1263
1264         hash = htab_map_hash(key, key_size, htab->hashrnd);
1265
1266         b = __select_bucket(htab, hash);
1267         head = &b->head;
1268
1269         /* For LRU, we need to alloc before taking bucket's
1270          * spinlock because LRU's elem alloc may need
1271          * to remove older elem from htab and this removal
1272          * operation will need a bucket lock.
1273          */
1274         if (map_flags != BPF_EXIST) {
1275                 l_new = prealloc_lru_pop(htab, key, hash);
1276                 if (!l_new)
1277                         return -ENOMEM;
1278         }
1279
1280         ret = htab_lock_bucket(htab, b, hash, &flags);
1281         if (ret)
1282                 return ret;
1283
1284         l_old = lookup_elem_raw(head, hash, key, key_size);
1285
1286         ret = check_flags(htab, l_old, map_flags);
1287         if (ret)
1288                 goto err;
1289
1290         if (l_old) {
1291                 bpf_lru_node_set_ref(&l_old->lru_node);
1292
1293                 /* per-cpu hash map can update value in-place */
1294                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1295                                 value, onallcpus);
1296         } else {
1297                 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1298                                 value, onallcpus);
1299                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1300                 l_new = NULL;
1301         }
1302         ret = 0;
1303 err:
1304         htab_unlock_bucket(htab, b, hash, flags);
1305         if (l_new)
1306                 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1307         return ret;
1308 }
1309
1310 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1311                                        void *value, u64 map_flags)
1312 {
1313         return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1314 }
1315
1316 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1317                                            void *value, u64 map_flags)
1318 {
1319         return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1320                                                  false);
1321 }
1322
1323 /* Called from syscall or from eBPF program */
1324 static int htab_map_delete_elem(struct bpf_map *map, void *key)
1325 {
1326         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1327         struct hlist_nulls_head *head;
1328         struct bucket *b;
1329         struct htab_elem *l;
1330         unsigned long flags;
1331         u32 hash, key_size;
1332         int ret;
1333
1334         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1335                      !rcu_read_lock_bh_held());
1336
1337         key_size = map->key_size;
1338
1339         hash = htab_map_hash(key, key_size, htab->hashrnd);
1340         b = __select_bucket(htab, hash);
1341         head = &b->head;
1342
1343         ret = htab_lock_bucket(htab, b, hash, &flags);
1344         if (ret)
1345                 return ret;
1346
1347         l = lookup_elem_raw(head, hash, key, key_size);
1348
1349         if (l) {
1350                 hlist_nulls_del_rcu(&l->hash_node);
1351                 free_htab_elem(htab, l);
1352         } else {
1353                 ret = -ENOENT;
1354         }
1355
1356         htab_unlock_bucket(htab, b, hash, flags);
1357         return ret;
1358 }
1359
1360 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1361 {
1362         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1363         struct hlist_nulls_head *head;
1364         struct bucket *b;
1365         struct htab_elem *l;
1366         unsigned long flags;
1367         u32 hash, key_size;
1368         int ret;
1369
1370         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1371                      !rcu_read_lock_bh_held());
1372
1373         key_size = map->key_size;
1374
1375         hash = htab_map_hash(key, key_size, htab->hashrnd);
1376         b = __select_bucket(htab, hash);
1377         head = &b->head;
1378
1379         ret = htab_lock_bucket(htab, b, hash, &flags);
1380         if (ret)
1381                 return ret;
1382
1383         l = lookup_elem_raw(head, hash, key, key_size);
1384
1385         if (l)
1386                 hlist_nulls_del_rcu(&l->hash_node);
1387         else
1388                 ret = -ENOENT;
1389
1390         htab_unlock_bucket(htab, b, hash, flags);
1391         if (l)
1392                 htab_lru_push_free(htab, l);
1393         return ret;
1394 }
1395
1396 static void delete_all_elements(struct bpf_htab *htab)
1397 {
1398         int i;
1399
1400         for (i = 0; i < htab->n_buckets; i++) {
1401                 struct hlist_nulls_head *head = select_bucket(htab, i);
1402                 struct hlist_nulls_node *n;
1403                 struct htab_elem *l;
1404
1405                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1406                         hlist_nulls_del_rcu(&l->hash_node);
1407                         htab_elem_free(htab, l);
1408                 }
1409         }
1410 }
1411
1412 static void htab_free_malloced_timers(struct bpf_htab *htab)
1413 {
1414         int i;
1415
1416         rcu_read_lock();
1417         for (i = 0; i < htab->n_buckets; i++) {
1418                 struct hlist_nulls_head *head = select_bucket(htab, i);
1419                 struct hlist_nulls_node *n;
1420                 struct htab_elem *l;
1421
1422                 hlist_nulls_for_each_entry(l, n, head, hash_node)
1423                         check_and_free_timer(htab, l);
1424                 cond_resched_rcu();
1425         }
1426         rcu_read_unlock();
1427 }
1428
1429 static void htab_map_free_timers(struct bpf_map *map)
1430 {
1431         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1432
1433         if (likely(!map_value_has_timer(&htab->map)))
1434                 return;
1435         if (!htab_is_prealloc(htab))
1436                 htab_free_malloced_timers(htab);
1437         else
1438                 htab_free_prealloced_timers(htab);
1439 }
1440
1441 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1442 static void htab_map_free(struct bpf_map *map)
1443 {
1444         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1445         int i;
1446
1447         /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1448          * bpf_free_used_maps() is called after bpf prog is no longer executing.
1449          * There is no need to synchronize_rcu() here to protect map elements.
1450          */
1451
1452         /* some of free_htab_elem() callbacks for elements of this map may
1453          * not have executed. Wait for them.
1454          */
1455         rcu_barrier();
1456         if (!htab_is_prealloc(htab))
1457                 delete_all_elements(htab);
1458         else
1459                 prealloc_destroy(htab);
1460
1461         free_percpu(htab->extra_elems);
1462         bpf_map_area_free(htab->buckets);
1463         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
1464                 free_percpu(htab->map_locked[i]);
1465         lockdep_unregister_key(&htab->lockdep_key);
1466         kfree(htab);
1467 }
1468
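/* Pretty-print one "key: value" pair using the map's BTF type ids; used
 * when a BTF-annotated hash map pinned in bpffs is dumped from user space.
 */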
1469 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1470                                    struct seq_file *m)
1471 {
1472         void *value;
1473
1474         rcu_read_lock();
1475
1476         value = htab_map_lookup_elem(map, key);
1477         if (!value) {
1478                 rcu_read_unlock();
1479                 return;
1480         }
1481
1482         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1483         seq_puts(m, ": ");
1484         btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1485         seq_puts(m, "\n");
1486
1487         rcu_read_unlock();
1488 }
1489
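/* Common implementation behind the four lookup_and_delete_elem variants
 * below. The value is copied out under the bucket lock (all possible CPUs
 * for percpu maps, honouring BPF_F_LOCK otherwise), the element is
 * unlinked, and it is freed either immediately or, for LRU maps, pushed
 * back to the LRU free list after the lock is released.
 */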
1490 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1491                                              void *value, bool is_lru_map,
1492                                              bool is_percpu, u64 flags)
1493 {
1494         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1495         struct hlist_nulls_head *head;
1496         unsigned long bflags;
1497         struct htab_elem *l;
1498         u32 hash, key_size;
1499         struct bucket *b;
1500         int ret;
1501
1502         key_size = map->key_size;
1503
1504         hash = htab_map_hash(key, key_size, htab->hashrnd);
1505         b = __select_bucket(htab, hash);
1506         head = &b->head;
1507
1508         ret = htab_lock_bucket(htab, b, hash, &bflags);
1509         if (ret)
1510                 return ret;
1511
1512         l = lookup_elem_raw(head, hash, key, key_size);
1513         if (!l) {
1514                 ret = -ENOENT;
1515         } else {
1516                 if (is_percpu) {
1517                         u32 roundup_value_size = round_up(map->value_size, 8);
1518                         void __percpu *pptr;
1519                         int off = 0, cpu;
1520
1521                         pptr = htab_elem_get_ptr(l, key_size);
1522                         for_each_possible_cpu(cpu) {
1523                                 bpf_long_memcpy(value + off,
1524                                                 per_cpu_ptr(pptr, cpu),
1525                                                 roundup_value_size);
1526                                 off += roundup_value_size;
1527                         }
1528                 } else {
1529                         u32 roundup_key_size = round_up(map->key_size, 8);
1530
1531                         if (flags & BPF_F_LOCK)
1532                                 copy_map_value_locked(map, value, l->key +
1533                                                       roundup_key_size,
1534                                                       true);
1535                         else
1536                                 copy_map_value(map, value, l->key +
1537                                                roundup_key_size);
1538                         check_and_init_map_value(map, value);
1539                 }
1540
1541                 hlist_nulls_del_rcu(&l->hash_node);
1542                 if (!is_lru_map)
1543                         free_htab_elem(htab, l);
1544         }
1545
1546         htab_unlock_bucket(htab, b, hash, bflags);
1547
1548         if (is_lru_map && l)
1549                 htab_lru_push_free(htab, l);
1550
1551         return ret;
1552 }
1553
1554 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1555                                            void *value, u64 flags)
1556 {
1557         return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1558                                                  flags);
1559 }
1560
1561 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1562                                                   void *key, void *value,
1563                                                   u64 flags)
1564 {
1565         return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1566                                                  flags);
1567 }
1568
1569 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1570                                                void *value, u64 flags)
1571 {
1572         return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1573                                                  flags);
1574 }
1575
1576 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1577                                                       void *key, void *value,
1578                                                       u64 flags)
1579 {
1580         return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1581                                                  flags);
1582 }
1583
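/* Core of the {lookup,lookup_and_delete}_batch operations. Buckets are
 * visited one at a time: the bucket is counted under RCU, locked only if
 * it is non-empty, its keys/values are copied into kernel buffers, and the
 * copy to user space happens after the lock and the RCU read section have
 * been dropped. attr->batch.in_batch/out_batch carry the bucket index used
 * to resume the walk; -ENOENT signals that the last bucket was reached.
 */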
1584 static int
1585 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1586                                    const union bpf_attr *attr,
1587                                    union bpf_attr __user *uattr,
1588                                    bool do_delete, bool is_lru_map,
1589                                    bool is_percpu)
1590 {
1591         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1592         u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
1593         void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1594         void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1595         void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1596         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1597         u32 batch, max_count, size, bucket_size;
1598         struct htab_elem *node_to_free = NULL;
1599         u64 elem_map_flags, map_flags;
1600         struct hlist_nulls_head *head;
1601         struct hlist_nulls_node *n;
1602         unsigned long flags = 0;
1603         bool locked = false;
1604         struct htab_elem *l;
1605         struct bucket *b;
1606         int ret = 0;
1607
1608         elem_map_flags = attr->batch.elem_flags;
1609         if ((elem_map_flags & ~BPF_F_LOCK) ||
1610             ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
1611                 return -EINVAL;
1612
1613         map_flags = attr->batch.flags;
1614         if (map_flags)
1615                 return -EINVAL;
1616
1617         max_count = attr->batch.count;
1618         if (!max_count)
1619                 return 0;
1620
1621         if (put_user(0, &uattr->batch.count))
1622                 return -EFAULT;
1623
1624         batch = 0;
1625         if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1626                 return -EFAULT;
1627
1628         if (batch >= htab->n_buckets)
1629                 return -ENOENT;
1630
1631         key_size = htab->map.key_size;
1632         roundup_key_size = round_up(htab->map.key_size, 8);
1633         value_size = htab->map.value_size;
1634         size = round_up(value_size, 8);
1635         if (is_percpu)
1636                 value_size = size * num_possible_cpus();
1637         total = 0;
1638         /* While experimenting with hash tables with sizes ranging from 10 to
1639          * 1000, it was observed that a bucket can have up to 5 entries.
1640          */
1641         bucket_size = 5;
1642
1643 alloc:
1644         /* We cannot call copy_from_user() or copy_to_user() inside the
1645          * RCU read-side critical section. Allocate enough space here.
1646          */
1647         keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1648         values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1649         if (!keys || !values) {
1650                 ret = -ENOMEM;
1651                 goto after_loop;
1652         }
1653
1654 again:
1655         bpf_disable_instrumentation();
1656         rcu_read_lock();
1657 again_nocopy:
1658         dst_key = keys;
1659         dst_val = values;
1660         b = &htab->buckets[batch];
1661         head = &b->head;
1662         /* do not grab the lock unless we need it (bucket_cnt > 0). */
1663         if (locked) {
1664                 ret = htab_lock_bucket(htab, b, batch, &flags);
1665                 if (ret)
1666                         goto next_batch;
1667         }
1668
1669         bucket_cnt = 0;
1670         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1671                 bucket_cnt++;
1672
1673         if (bucket_cnt && !locked) {
1674                 locked = true;
1675                 goto again_nocopy;
1676         }
1677
1678         if (bucket_cnt > (max_count - total)) {
1679                 if (total == 0)
1680                         ret = -ENOSPC;
1681                 /* Note that since bucket_cnt > 0 here, it is implicit
1682                  * that the lock was grabbed, so release it.
1683                  */
1684                 htab_unlock_bucket(htab, b, batch, flags);
1685                 rcu_read_unlock();
1686                 bpf_enable_instrumentation();
1687                 goto after_loop;
1688         }
1689
1690         if (bucket_cnt > bucket_size) {
1691                 bucket_size = bucket_cnt;
1692                 /* Note that since bucket_cnt > 0 here, it is implicit
1693                  * that the lock was grabbed, so release it.
1694                  */
1695                 htab_unlock_bucket(htab, b, batch, flags);
1696                 rcu_read_unlock();
1697                 bpf_enable_instrumentation();
1698                 kvfree(keys);
1699                 kvfree(values);
1700                 goto alloc;
1701         }
1702
1703         /* The next block is only safe to run if the lock has been grabbed */
1704         if (!locked)
1705                 goto next_batch;
1706
1707         hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1708                 memcpy(dst_key, l->key, key_size);
1709
1710                 if (is_percpu) {
1711                         int off = 0, cpu;
1712                         void __percpu *pptr;
1713
1714                         pptr = htab_elem_get_ptr(l, map->key_size);
1715                         for_each_possible_cpu(cpu) {
1716                                 bpf_long_memcpy(dst_val + off,
1717                                                 per_cpu_ptr(pptr, cpu), size);
1718                                 off += size;
1719                         }
1720                 } else {
1721                         value = l->key + roundup_key_size;
1722                         if (elem_map_flags & BPF_F_LOCK)
1723                                 copy_map_value_locked(map, dst_val, value,
1724                                                       true);
1725                         else
1726                                 copy_map_value(map, dst_val, value);
1727                         check_and_init_map_value(map, dst_val);
1728                 }
1729                 if (do_delete) {
1730                         hlist_nulls_del_rcu(&l->hash_node);
1731
1732                         /* bpf_lru_push_free() will acquire lru_lock, which
1733                          * may cause deadlock. See comments in function
1734                          * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1735                          * after releasing the bucket lock.
1736                          */
1737                         if (is_lru_map) {
1738                                 l->batch_flink = node_to_free;
1739                                 node_to_free = l;
1740                         } else {
1741                                 free_htab_elem(htab, l);
1742                         }
1743                 }
1744                 dst_key += key_size;
1745                 dst_val += value_size;
1746         }
1747
1748         htab_unlock_bucket(htab, b, batch, flags);
1749         locked = false;
1750
1751         while (node_to_free) {
1752                 l = node_to_free;
1753                 node_to_free = node_to_free->batch_flink;
1754                 htab_lru_push_free(htab, l);
1755         }
1756
1757 next_batch:
1758         /* If we are not copying data, we can go to the next bucket and
1759          * avoid dropping the RCU read lock.
1760          */
1761         if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1762                 batch++;
1763                 goto again_nocopy;
1764         }
1765
1766         rcu_read_unlock();
1767         bpf_enable_instrumentation();
1768         if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1769             key_size * bucket_cnt) ||
1770             copy_to_user(uvalues + total * value_size, values,
1771             value_size * bucket_cnt))) {
1772                 ret = -EFAULT;
1773                 goto after_loop;
1774         }
1775
1776         total += bucket_cnt;
1777         batch++;
1778         if (batch >= htab->n_buckets) {
1779                 ret = -ENOENT;
1780                 goto after_loop;
1781         }
1782         goto again;
1783
1784 after_loop:
1785         if (ret == -EFAULT)
1786                 goto out;
1787
1788         /* copy # of entries and next batch */
1789         ubatch = u64_to_user_ptr(attr->batch.out_batch);
1790         if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1791             put_user(total, &uattr->batch.count))
1792                 ret = -EFAULT;
1793
1794 out:
1795         kvfree(keys);
1796         kvfree(values);
1797         return ret;
1798 }
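/* A minimal user-space sketch of driving the batch walk above, assuming
 * libbpf's bpf_map_lookup_batch() wrapper; map_fd, keys, values and the
 * consume() step are purely illustrative:
 *
 *	__u32 in = 0, out = 0, count;
 *	void *in_batch = NULL;
 *	int err;
 *
 *	do {
 *		count = 64;
 *		err = bpf_map_lookup_batch(map_fd, in_batch, &out,
 *					   keys, values, &count, NULL);
 *		consume(keys, values, count);	// entries are valid even on
 *						// the final -ENOENT
 *		in = out;
 *		in_batch = &in;
 *	} while (!err);
 */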
1799
1800 static int
1801 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1802                              union bpf_attr __user *uattr)
1803 {
1804         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1805                                                   false, true);
1806 }
1807
1808 static int
1809 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1810                                         const union bpf_attr *attr,
1811                                         union bpf_attr __user *uattr)
1812 {
1813         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1814                                                   false, true);
1815 }
1816
1817 static int
1818 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1819                       union bpf_attr __user *uattr)
1820 {
1821         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1822                                                   false, false);
1823 }
1824
1825 static int
1826 htab_map_lookup_and_delete_batch(struct bpf_map *map,
1827                                  const union bpf_attr *attr,
1828                                  union bpf_attr __user *uattr)
1829 {
1830         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1831                                                   false, false);
1832 }
1833
1834 static int
1835 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1836                                  const union bpf_attr *attr,
1837                                  union bpf_attr __user *uattr)
1838 {
1839         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1840                                                   true, true);
1841 }
1842
1843 static int
1844 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1845                                             const union bpf_attr *attr,
1846                                             union bpf_attr __user *uattr)
1847 {
1848         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1849                                                   true, true);
1850 }
1851
1852 static int
1853 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1854                           union bpf_attr __user *uattr)
1855 {
1856         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1857                                                   true, false);
1858 }
1859
1860 static int
1861 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1862                                      const union bpf_attr *attr,
1863                                      union bpf_attr __user *uattr)
1864 {
1865         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1866                                                   true, false);
1867 }
1868
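/* State of a bpf_iter (seq_file) walk over a hash map. bucket_id and
 * skip_elems record where to resume after each element; percpu_value_buf
 * is only allocated for percpu maps and holds the gathered per-CPU values
 * of the current element.
 */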
1869 struct bpf_iter_seq_hash_map_info {
1870         struct bpf_map *map;
1871         struct bpf_htab *htab;
1872         void *percpu_value_buf; /* non-NULL means percpu hash */
1873         u32 bucket_id;
1874         u32 skip_elems;
1875 };
1876
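/* Find the next element to show. The RCU read lock taken below stays held
 * while an element is returned and is only dropped when moving on to the
 * next bucket or in bpf_hash_map_seq_stop(); skip_elems re-positions
 * within a bucket in case elements were added or removed concurrently.
 */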
1877 static struct htab_elem *
1878 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1879                            struct htab_elem *prev_elem)
1880 {
1881         const struct bpf_htab *htab = info->htab;
1882         u32 skip_elems = info->skip_elems;
1883         u32 bucket_id = info->bucket_id;
1884         struct hlist_nulls_head *head;
1885         struct hlist_nulls_node *n;
1886         struct htab_elem *elem;
1887         struct bucket *b;
1888         u32 i, count;
1889
1890         if (bucket_id >= htab->n_buckets)
1891                 return NULL;
1892
1893         /* try to find next elem in the same bucket */
1894         if (prev_elem) {
1895                 /* If there was no update/deletion on this bucket, prev_elem
1896                  * should still be valid and we won't skip elements.
1897                  */
1898                 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1899                 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1900                 if (elem)
1901                         return elem;
1902
1903                 /* not found, unlock and go to the next bucket */
1904                 b = &htab->buckets[bucket_id++];
1905                 rcu_read_unlock();
1906                 skip_elems = 0;
1907         }
1908
1909         for (i = bucket_id; i < htab->n_buckets; i++) {
1910                 b = &htab->buckets[i];
1911                 rcu_read_lock();
1912
1913                 count = 0;
1914                 head = &b->head;
1915                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
1916                         if (count >= skip_elems) {
1917                                 info->bucket_id = i;
1918                                 info->skip_elems = count;
1919                                 return elem;
1920                         }
1921                         count++;
1922                 }
1923
1924                 rcu_read_unlock();
1925                 skip_elems = 0;
1926         }
1927
1928         info->bucket_id = i;
1929         info->skip_elems = 0;
1930         return NULL;
1931 }
1932
1933 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
1934 {
1935         struct bpf_iter_seq_hash_map_info *info = seq->private;
1936         struct htab_elem *elem;
1937
1938         elem = bpf_hash_map_seq_find_next(info, NULL);
1939         if (!elem)
1940                 return NULL;
1941
1942         if (*pos == 0)
1943                 ++*pos;
1944         return elem;
1945 }
1946
1947 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1948 {
1949         struct bpf_iter_seq_hash_map_info *info = seq->private;
1950
1951         ++*pos;
1952         ++info->skip_elems;
1953         return bpf_hash_map_seq_find_next(info, v);
1954 }
1955
1956 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
1957 {
1958         struct bpf_iter_seq_hash_map_info *info = seq->private;
1959         u32 roundup_key_size, roundup_value_size;
1960         struct bpf_iter__bpf_map_elem ctx = {};
1961         struct bpf_map *map = info->map;
1962         struct bpf_iter_meta meta;
1963         int ret = 0, off = 0, cpu;
1964         struct bpf_prog *prog;
1965         void __percpu *pptr;
1966
1967         meta.seq = seq;
1968         prog = bpf_iter_get_info(&meta, elem == NULL);
1969         if (prog) {
1970                 ctx.meta = &meta;
1971                 ctx.map = info->map;
1972                 if (elem) {
1973                         roundup_key_size = round_up(map->key_size, 8);
1974                         ctx.key = elem->key;
1975                         if (!info->percpu_value_buf) {
1976                                 ctx.value = elem->key + roundup_key_size;
1977                         } else {
1978                                 roundup_value_size = round_up(map->value_size, 8);
1979                                 pptr = htab_elem_get_ptr(elem, map->key_size);
1980                                 for_each_possible_cpu(cpu) {
1981                                         bpf_long_memcpy(info->percpu_value_buf + off,
1982                                                         per_cpu_ptr(pptr, cpu),
1983                                                         roundup_value_size);
1984                                         off += roundup_value_size;
1985                                 }
1986                                 ctx.value = info->percpu_value_buf;
1987                         }
1988                 }
1989                 ret = bpf_iter_run_prog(prog, &ctx);
1990         }
1991
1992         return ret;
1993 }
1994
1995 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
1996 {
1997         return __bpf_hash_map_seq_show(seq, v);
1998 }
1999
2000 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2001 {
2002         if (!v)
2003                 (void)__bpf_hash_map_seq_show(seq, NULL);
2004         else
2005                 rcu_read_unlock();
2006 }
2007
2008 static int bpf_iter_init_hash_map(void *priv_data,
2009                                   struct bpf_iter_aux_info *aux)
2010 {
2011         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2012         struct bpf_map *map = aux->map;
2013         void *value_buf;
2014         u32 buf_size;
2015
2016         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2017             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2018                 buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2019                 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2020                 if (!value_buf)
2021                         return -ENOMEM;
2022
2023                 seq_info->percpu_value_buf = value_buf;
2024         }
2025
2026         seq_info->map = map;
2027         seq_info->htab = container_of(map, struct bpf_htab, map);
2028         return 0;
2029 }
2030
2031 static void bpf_iter_fini_hash_map(void *priv_data)
2032 {
2033         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2034
2035         kfree(seq_info->percpu_value_buf);
2036 }
2037
2038 static const struct seq_operations bpf_hash_map_seq_ops = {
2039         .start  = bpf_hash_map_seq_start,
2040         .next   = bpf_hash_map_seq_next,
2041         .stop   = bpf_hash_map_seq_stop,
2042         .show   = bpf_hash_map_seq_show,
2043 };
2044
2045 static const struct bpf_iter_seq_info iter_seq_info = {
2046         .seq_ops                = &bpf_hash_map_seq_ops,
2047         .init_seq_private       = bpf_iter_init_hash_map,
2048         .fini_seq_private       = bpf_iter_fini_hash_map,
2049         .seq_priv_size          = sizeof(struct bpf_iter_seq_hash_map_info),
2050 };
2051
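/* Back-end of the bpf_for_each_map_elem() helper for hash maps: the
 * callback is invoked as callback_fn(map, key, value, callback_ctx) for
 * every element under RCU, a non-zero return value stops the walk and the
 * number of elements visited is returned. A rough sketch of a BPF-side
 * callback (my_hash and count_cb are illustrative only):
 *
 *	static long count_cb(struct bpf_map *map, void *key,
 *			     void *value, void *ctx)
 *	{
 *		(*(long *)ctx)++;
 *		return 0;	// 0 - continue, 1 - stop
 *	}
 *	...
 *	long n = 0;
 *	bpf_for_each_map_elem(&my_hash, count_cb, &n, 0);
 */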
2052 static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
2053                                   void *callback_ctx, u64 flags)
2054 {
2055         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2056         struct hlist_nulls_head *head;
2057         struct hlist_nulls_node *n;
2058         struct htab_elem *elem;
2059         u32 roundup_key_size;
2060         int i, num_elems = 0;
2061         void __percpu *pptr;
2062         struct bucket *b;
2063         void *key, *val;
2064         bool is_percpu;
2065         u64 ret = 0;
2066
2067         if (flags != 0)
2068                 return -EINVAL;
2069
2070         is_percpu = htab_is_percpu(htab);
2071
2072         roundup_key_size = round_up(map->key_size, 8);
2073         /* Disable migration so the percpu value prepared here is the same
2074          * one the bpf program would see via bpf_map_lookup_elem().
2075          */
2076         if (is_percpu)
2077                 migrate_disable();
2078         for (i = 0; i < htab->n_buckets; i++) {
2079                 b = &htab->buckets[i];
2080                 rcu_read_lock();
2081                 head = &b->head;
2082                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2083                         key = elem->key;
2084                         if (is_percpu) {
2085                                 /* current cpu value for percpu map */
2086                                 pptr = htab_elem_get_ptr(elem, map->key_size);
2087                                 val = this_cpu_ptr(pptr);
2088                         } else {
2089                                 val = elem->key + roundup_key_size;
2090                         }
2091                         num_elems++;
2092                         ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
2093                                         (u64)(long)key, (u64)(long)val,
2094                                         (u64)(long)callback_ctx, 0);
2095                         /* return value: 0 - continue, 1 - stop and return */
2096                         if (ret) {
2097                                 rcu_read_unlock();
2098                                 goto out;
2099                         }
2100                 }
2101                 rcu_read_unlock();
2102         }
2103 out:
2104         if (is_percpu)
2105                 migrate_enable();
2106         return num_elems;
2107 }
2108
2109 static int htab_map_btf_id;
2110 const struct bpf_map_ops htab_map_ops = {
2111         .map_meta_equal = bpf_map_meta_equal,
2112         .map_alloc_check = htab_map_alloc_check,
2113         .map_alloc = htab_map_alloc,
2114         .map_free = htab_map_free,
2115         .map_get_next_key = htab_map_get_next_key,
2116         .map_release_uref = htab_map_free_timers,
2117         .map_lookup_elem = htab_map_lookup_elem,
2118         .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2119         .map_update_elem = htab_map_update_elem,
2120         .map_delete_elem = htab_map_delete_elem,
2121         .map_gen_lookup = htab_map_gen_lookup,
2122         .map_seq_show_elem = htab_map_seq_show_elem,
2123         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2124         .map_for_each_callback = bpf_for_each_hash_elem,
2125         BATCH_OPS(htab),
2126         .map_btf_name = "bpf_htab",
2127         .map_btf_id = &htab_map_btf_id,
2128         .iter_seq_info = &iter_seq_info,
2129 };
2130
2131 static int htab_lru_map_btf_id;
2132 const struct bpf_map_ops htab_lru_map_ops = {
2133         .map_meta_equal = bpf_map_meta_equal,
2134         .map_alloc_check = htab_map_alloc_check,
2135         .map_alloc = htab_map_alloc,
2136         .map_free = htab_map_free,
2137         .map_get_next_key = htab_map_get_next_key,
2138         .map_release_uref = htab_map_free_timers,
2139         .map_lookup_elem = htab_lru_map_lookup_elem,
2140         .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2141         .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2142         .map_update_elem = htab_lru_map_update_elem,
2143         .map_delete_elem = htab_lru_map_delete_elem,
2144         .map_gen_lookup = htab_lru_map_gen_lookup,
2145         .map_seq_show_elem = htab_map_seq_show_elem,
2146         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2147         .map_for_each_callback = bpf_for_each_hash_elem,
2148         BATCH_OPS(htab_lru),
2149         .map_btf_name = "bpf_htab",
2150         .map_btf_id = &htab_lru_map_btf_id,
2151         .iter_seq_info = &iter_seq_info,
2152 };
2153
2154 /* Called from eBPF program */
2155 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2156 {
2157         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2158
2159         if (l)
2160                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2161         else
2162                 return NULL;
2163 }
2164
2165 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2166 {
2167         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2168
2169         if (l) {
2170                 bpf_lru_node_set_ref(&l->lru_node);
2171                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2172         }
2173
2174         return NULL;
2175 }
2176
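/* Syscall-side lookup for percpu hash maps: gather the value (rounded up
 * to 8 bytes) from every possible CPU into the flat buffer supplied by
 * user space.
 */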
2177 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2178 {
2179         struct htab_elem *l;
2180         void __percpu *pptr;
2181         int ret = -ENOENT;
2182         int cpu, off = 0;
2183         u32 size;
2184
2185         /* per_cpu areas are zero-filled and bpf programs can only
2186          * access 'value_size' bytes of them, so copying the rounded-up
2187          * areas will not leak any kernel data
2188          */
2189         size = round_up(map->value_size, 8);
2190         rcu_read_lock();
2191         l = __htab_map_lookup_elem(map, key);
2192         if (!l)
2193                 goto out;
2194         /* We do not mark the LRU map element here so as not to mess up
2195          * the eviction heuristics when user space does a map walk.
2196          */
2197         pptr = htab_elem_get_ptr(l, map->key_size);
2198         for_each_possible_cpu(cpu) {
2199                 bpf_long_memcpy(value + off,
2200                                 per_cpu_ptr(pptr, cpu), size);
2201                 off += size;
2202         }
2203         ret = 0;
2204 out:
2205         rcu_read_unlock();
2206         return ret;
2207 }
2208
2209 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2210                            u64 map_flags)
2211 {
2212         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2213         int ret;
2214
2215         rcu_read_lock();
2216         if (htab_is_lru(htab))
2217                 ret = __htab_lru_percpu_map_update_elem(map, key, value,
2218                                                         map_flags, true);
2219         else
2220                 ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
2221                                                     true);
2222         rcu_read_unlock();
2223
2224         return ret;
2225 }
2226
2227 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2228                                           struct seq_file *m)
2229 {
2230         struct htab_elem *l;
2231         void __percpu *pptr;
2232         int cpu;
2233
2234         rcu_read_lock();
2235
2236         l = __htab_map_lookup_elem(map, key);
2237         if (!l) {
2238                 rcu_read_unlock();
2239                 return;
2240         }
2241
2242         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2243         seq_puts(m, ": {\n");
2244         pptr = htab_elem_get_ptr(l, map->key_size);
2245         for_each_possible_cpu(cpu) {
2246                 seq_printf(m, "\tcpu%d: ", cpu);
2247                 btf_type_seq_show(map->btf, map->btf_value_type_id,
2248                                   per_cpu_ptr(pptr, cpu), m);
2249                 seq_puts(m, "\n");
2250         }
2251         seq_puts(m, "}\n");
2252
2253         rcu_read_unlock();
2254 }
2255
2256 static int htab_percpu_map_btf_id;
2257 const struct bpf_map_ops htab_percpu_map_ops = {
2258         .map_meta_equal = bpf_map_meta_equal,
2259         .map_alloc_check = htab_map_alloc_check,
2260         .map_alloc = htab_map_alloc,
2261         .map_free = htab_map_free,
2262         .map_get_next_key = htab_map_get_next_key,
2263         .map_lookup_elem = htab_percpu_map_lookup_elem,
2264         .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2265         .map_update_elem = htab_percpu_map_update_elem,
2266         .map_delete_elem = htab_map_delete_elem,
2267         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2268         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2269         .map_for_each_callback = bpf_for_each_hash_elem,
2270         BATCH_OPS(htab_percpu),
2271         .map_btf_name = "bpf_htab",
2272         .map_btf_id = &htab_percpu_map_btf_id,
2273         .iter_seq_info = &iter_seq_info,
2274 };
2275
2276 static int htab_lru_percpu_map_btf_id;
2277 const struct bpf_map_ops htab_lru_percpu_map_ops = {
2278         .map_meta_equal = bpf_map_meta_equal,
2279         .map_alloc_check = htab_map_alloc_check,
2280         .map_alloc = htab_map_alloc,
2281         .map_free = htab_map_free,
2282         .map_get_next_key = htab_map_get_next_key,
2283         .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2284         .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2285         .map_update_elem = htab_lru_percpu_map_update_elem,
2286         .map_delete_elem = htab_lru_map_delete_elem,
2287         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2288         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2289         .map_for_each_callback = bpf_for_each_hash_elem,
2290         BATCH_OPS(htab_lru_percpu),
2291         .map_btf_name = "bpf_htab",
2292         .map_btf_id = &htab_lru_percpu_map_btf_id,
2293         .iter_seq_info = &iter_seq_info,
2294 };
2295
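/* fd-based hash maps (used for hash-of-maps): the value written from user
 * space must be a 32-bit fd, which map_fd_get_ptr() translates into the
 * kernel pointer that is actually stored in the element.
 */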
2296 static int fd_htab_map_alloc_check(union bpf_attr *attr)
2297 {
2298         if (attr->value_size != sizeof(u32))
2299                 return -EINVAL;
2300         return htab_map_alloc_check(attr);
2301 }
2302
2303 static void fd_htab_map_free(struct bpf_map *map)
2304 {
2305         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2306         struct hlist_nulls_node *n;
2307         struct hlist_nulls_head *head;
2308         struct htab_elem *l;
2309         int i;
2310
2311         for (i = 0; i < htab->n_buckets; i++) {
2312                 head = select_bucket(htab, i);
2313
2314                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2315                         void *ptr = fd_htab_map_get_ptr(map, l);
2316
2317                         map->ops->map_fd_put_ptr(ptr);
2318                 }
2319         }
2320
2321         htab_map_free(map);
2322 }
2323
2324 /* only called from syscall */
2325 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2326 {
2327         void **ptr;
2328         int ret = 0;
2329
2330         if (!map->ops->map_fd_sys_lookup_elem)
2331                 return -ENOTSUPP;
2332
2333         rcu_read_lock();
2334         ptr = htab_map_lookup_elem(map, key);
2335         if (ptr)
2336                 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2337         else
2338                 ret = -ENOENT;
2339         rcu_read_unlock();
2340
2341         return ret;
2342 }
2343
2344 /* only called from syscall */
2345 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2346                                 void *key, void *value, u64 map_flags)
2347 {
2348         void *ptr;
2349         int ret;
2350         u32 ufd = *(u32 *)value;
2351
2352         ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2353         if (IS_ERR(ptr))
2354                 return PTR_ERR(ptr);
2355
2356         ret = htab_map_update_elem(map, key, &ptr, map_flags);
2357         if (ret)
2358                 map->ops->map_fd_put_ptr(ptr);
2359
2360         return ret;
2361 }
2362
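/* Hash-of-maps allocation: capture a template of the inner map
 * (inner_map_meta) from attr->inner_map_fd so that later updates can be
 * checked against it for compatible inner maps.
 */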
2363 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2364 {
2365         struct bpf_map *map, *inner_map_meta;
2366
2367         inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2368         if (IS_ERR(inner_map_meta))
2369                 return inner_map_meta;
2370
2371         map = htab_map_alloc(attr);
2372         if (IS_ERR(map)) {
2373                 bpf_map_meta_free(inner_map_meta);
2374                 return map;
2375         }
2376
2377         map->inner_map_meta = inner_map_meta;
2378
2379         return map;
2380 }
2381
2382 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2383 {
2384         struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
2385
2386         if (!inner_map)
2387                 return NULL;
2388
2389         return READ_ONCE(*inner_map);
2390 }
2391
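/* Inline the hash-of-maps lookup into the calling program: call
 * __htab_map_lookup_elem() and, if an element was found, advance past the
 * key to the value area and load the stored inner map pointer with a
 * single BPF_DW load.
 */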
2392 static int htab_of_map_gen_lookup(struct bpf_map *map,
2393                                   struct bpf_insn *insn_buf)
2394 {
2395         struct bpf_insn *insn = insn_buf;
2396         const int ret = BPF_REG_0;
2397
2398         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2399                      (void *(*)(struct bpf_map *map, void *key))NULL));
2400         *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
2401         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2402         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2403                                 offsetof(struct htab_elem, key) +
2404                                 round_up(map->key_size, 8));
2405         *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2406
2407         return insn - insn_buf;
2408 }
2409
2410 static void htab_of_map_free(struct bpf_map *map)
2411 {
2412         bpf_map_meta_free(map->inner_map_meta);
2413         fd_htab_map_free(map);
2414 }
2415
2416 static int htab_of_maps_map_btf_id;
2417 const struct bpf_map_ops htab_of_maps_map_ops = {
2418         .map_alloc_check = fd_htab_map_alloc_check,
2419         .map_alloc = htab_of_map_alloc,
2420         .map_free = htab_of_map_free,
2421         .map_get_next_key = htab_map_get_next_key,
2422         .map_lookup_elem = htab_of_map_lookup_elem,
2423         .map_delete_elem = htab_map_delete_elem,
2424         .map_fd_get_ptr = bpf_map_fd_get_ptr,
2425         .map_fd_put_ptr = bpf_map_fd_put_ptr,
2426         .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2427         .map_gen_lookup = htab_of_map_gen_lookup,
2428         .map_check_btf = map_check_no_btf,
2429         .map_btf_name = "bpf_htab",
2430         .map_btf_id = &htab_of_maps_map_btf_id,
2431 };