perf cpumap: Add new map type for aggregation
[linux-2.6-microblaze.git] / lib / rhashtable.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Resizable, Scalable, Concurrent Hash Table
4  *
5  * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
6  * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
7  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
8  *
9  * Code partially derived from nft_hash
10  * Rewritten with rehash code from br_multicast plus single list
11  * pointer as suggested by Josh Triplett
12  */
13
14 #include <linux/atomic.h>
15 #include <linux/kernel.h>
16 #include <linux/init.h>
17 #include <linux/log2.h>
18 #include <linux/sched.h>
19 #include <linux/rculist.h>
20 #include <linux/slab.h>
21 #include <linux/vmalloc.h>
22 #include <linux/mm.h>
23 #include <linux/jhash.h>
24 #include <linux/random.h>
25 #include <linux/rhashtable.h>
26 #include <linux/err.h>
27 #include <linux/export.h>
28
29 #define HASH_DEFAULT_SIZE       64UL
30 #define HASH_MIN_SIZE           4U
31
32 union nested_table {
33         union nested_table __rcu *table;
34         struct rhash_lock_head __rcu *bucket;
35 };
36
37 static u32 head_hashfn(struct rhashtable *ht,
38                        const struct bucket_table *tbl,
39                        const struct rhash_head *he)
40 {
41         return rht_head_hashfn(ht, tbl, he, ht->p);
42 }
43
44 #ifdef CONFIG_PROVE_LOCKING
45 #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
46
47 int lockdep_rht_mutex_is_held(struct rhashtable *ht)
48 {
49         return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
50 }
51 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
52
53 int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
54 {
55         if (!debug_locks)
56                 return 1;
57         if (unlikely(tbl->nest))
58                 return 1;
59         return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
60 }
61 EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
62 #else
63 #define ASSERT_RHT_MUTEX(HT)
64 #endif
65
66 static inline union nested_table *nested_table_top(
67         const struct bucket_table *tbl)
68 {
69         /* The top-level bucket entry does not need RCU protection
70          * because it's set at the same time as tbl->nest.
71          */
72         return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
73 }
74
75 static void nested_table_free(union nested_table *ntbl, unsigned int size)
76 {
77         const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
78         const unsigned int len = 1 << shift;
79         unsigned int i;
80
81         ntbl = rcu_dereference_protected(ntbl->table, 1);
82         if (!ntbl)
83                 return;
84
85         if (size > len) {
86                 size >>= shift;
87                 for (i = 0; i < len; i++)
88                         nested_table_free(ntbl + i, size);
89         }
90
91         kfree(ntbl);
92 }
93
94 static void nested_bucket_table_free(const struct bucket_table *tbl)
95 {
96         unsigned int size = tbl->size >> tbl->nest;
97         unsigned int len = 1 << tbl->nest;
98         union nested_table *ntbl;
99         unsigned int i;
100
101         ntbl = nested_table_top(tbl);
102
103         for (i = 0; i < len; i++)
104                 nested_table_free(ntbl + i, size);
105
106         kfree(ntbl);
107 }
108
109 static void bucket_table_free(const struct bucket_table *tbl)
110 {
111         if (tbl->nest)
112                 nested_bucket_table_free(tbl);
113
114         kvfree(tbl);
115 }
116
117 static void bucket_table_free_rcu(struct rcu_head *head)
118 {
119         bucket_table_free(container_of(head, struct bucket_table, rcu));
120 }
121
122 static union nested_table *nested_table_alloc(struct rhashtable *ht,
123                                               union nested_table __rcu **prev,
124                                               bool leaf)
125 {
126         union nested_table *ntbl;
127         int i;
128
129         ntbl = rcu_dereference(*prev);
130         if (ntbl)
131                 return ntbl;
132
133         ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
134
135         if (ntbl && leaf) {
136                 for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
137                         INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
138         }
139
140         if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
141                 return ntbl;
142         /* Raced with another thread. */
143         kfree(ntbl);
144         return rcu_dereference(*prev);
145 }
146
147 static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
148                                                       size_t nbuckets,
149                                                       gfp_t gfp)
150 {
151         const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
152         struct bucket_table *tbl;
153         size_t size;
154
155         if (nbuckets < (1 << (shift + 1)))
156                 return NULL;
157
158         size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
159
160         tbl = kzalloc(size, gfp);
161         if (!tbl)
162                 return NULL;
163
164         if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
165                                 false)) {
166                 kfree(tbl);
167                 return NULL;
168         }
169
170         tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
171
172         return tbl;
173 }
174
175 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
176                                                size_t nbuckets,
177                                                gfp_t gfp)
178 {
179         struct bucket_table *tbl = NULL;
180         size_t size;
181         int i;
182         static struct lock_class_key __key;
183
184         tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
185
186         size = nbuckets;
187
188         if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
189                 tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
190                 nbuckets = 0;
191         }
192
193         if (tbl == NULL)
194                 return NULL;
195
196         lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);
197
198         tbl->size = size;
199
200         rcu_head_init(&tbl->rcu);
201         INIT_LIST_HEAD(&tbl->walkers);
202
203         tbl->hash_rnd = get_random_u32();
204
205         for (i = 0; i < nbuckets; i++)
206                 INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
207
208         return tbl;
209 }
210
211 static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
212                                                   struct bucket_table *tbl)
213 {
214         struct bucket_table *new_tbl;
215
216         do {
217                 new_tbl = tbl;
218                 tbl = rht_dereference_rcu(tbl->future_tbl, ht);
219         } while (tbl);
220
221         return new_tbl;
222 }
223
224 static int rhashtable_rehash_one(struct rhashtable *ht,
225                                  struct rhash_lock_head __rcu **bkt,
226                                  unsigned int old_hash)
227 {
228         struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
229         struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
230         int err = -EAGAIN;
231         struct rhash_head *head, *next, *entry;
232         struct rhash_head __rcu **pprev = NULL;
233         unsigned int new_hash;
234
235         if (new_tbl->nest)
236                 goto out;
237
238         err = -ENOENT;
239
240         rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
241                           old_tbl, old_hash) {
242                 err = 0;
243                 next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
244
245                 if (rht_is_a_nulls(next))
246                         break;
247
248                 pprev = &entry->next;
249         }
250
251         if (err)
252                 goto out;
253
254         new_hash = head_hashfn(ht, new_tbl, entry);
255
256         rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING);
257
258         head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
259
260         RCU_INIT_POINTER(entry->next, head);
261
262         rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
263
264         if (pprev)
265                 rcu_assign_pointer(*pprev, next);
266         else
267                 /* Need to preserved the bit lock. */
268                 rht_assign_locked(bkt, next);
269
270 out:
271         return err;
272 }
273
274 static int rhashtable_rehash_chain(struct rhashtable *ht,
275                                     unsigned int old_hash)
276 {
277         struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
278         struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
279         int err;
280
281         if (!bkt)
282                 return 0;
283         rht_lock(old_tbl, bkt);
284
285         while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
286                 ;
287
288         if (err == -ENOENT)
289                 err = 0;
290         rht_unlock(old_tbl, bkt);
291
292         return err;
293 }
294
295 static int rhashtable_rehash_attach(struct rhashtable *ht,
296                                     struct bucket_table *old_tbl,
297                                     struct bucket_table *new_tbl)
298 {
299         /* Make insertions go into the new, empty table right away. Deletions
300          * and lookups will be attempted in both tables until we synchronize.
301          * As cmpxchg() provides strong barriers, we do not need
302          * rcu_assign_pointer().
303          */
304
305         if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
306                     new_tbl) != NULL)
307                 return -EEXIST;
308
309         return 0;
310 }
311
312 static int rhashtable_rehash_table(struct rhashtable *ht)
313 {
314         struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
315         struct bucket_table *new_tbl;
316         struct rhashtable_walker *walker;
317         unsigned int old_hash;
318         int err;
319
320         new_tbl = rht_dereference(old_tbl->future_tbl, ht);
321         if (!new_tbl)
322                 return 0;
323
324         for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
325                 err = rhashtable_rehash_chain(ht, old_hash);
326                 if (err)
327                         return err;
328                 cond_resched();
329         }
330
331         /* Publish the new table pointer. */
332         rcu_assign_pointer(ht->tbl, new_tbl);
333
334         spin_lock(&ht->lock);
335         list_for_each_entry(walker, &old_tbl->walkers, list)
336                 walker->tbl = NULL;
337
338         /* Wait for readers. All new readers will see the new
339          * table, and thus no references to the old table will
340          * remain.
341          * We do this inside the locked region so that
342          * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
343          * to check if it should not re-link the table.
344          */
345         call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
346         spin_unlock(&ht->lock);
347
348         return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
349 }
350
351 static int rhashtable_rehash_alloc(struct rhashtable *ht,
352                                    struct bucket_table *old_tbl,
353                                    unsigned int size)
354 {
355         struct bucket_table *new_tbl;
356         int err;
357
358         ASSERT_RHT_MUTEX(ht);
359
360         new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
361         if (new_tbl == NULL)
362                 return -ENOMEM;
363
364         err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
365         if (err)
366                 bucket_table_free(new_tbl);
367
368         return err;
369 }
370
371 /**
372  * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
373  * @ht:         the hash table to shrink
374  *
375  * This function shrinks the hash table to fit, i.e., the smallest
376  * size would not cause it to expand right away automatically.
377  *
378  * The caller must ensure that no concurrent resizing occurs by holding
379  * ht->mutex.
380  *
381  * The caller must ensure that no concurrent table mutations take place.
382  * It is however valid to have concurrent lookups if they are RCU protected.
383  *
384  * It is valid to have concurrent insertions and deletions protected by per
385  * bucket locks or concurrent RCU protected lookups and traversals.
386  */
387 static int rhashtable_shrink(struct rhashtable *ht)
388 {
389         struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
390         unsigned int nelems = atomic_read(&ht->nelems);
391         unsigned int size = 0;
392
393         if (nelems)
394                 size = roundup_pow_of_two(nelems * 3 / 2);
395         if (size < ht->p.min_size)
396                 size = ht->p.min_size;
397
398         if (old_tbl->size <= size)
399                 return 0;
400
401         if (rht_dereference(old_tbl->future_tbl, ht))
402                 return -EEXIST;
403
404         return rhashtable_rehash_alloc(ht, old_tbl, size);
405 }
406
407 static void rht_deferred_worker(struct work_struct *work)
408 {
409         struct rhashtable *ht;
410         struct bucket_table *tbl;
411         int err = 0;
412
413         ht = container_of(work, struct rhashtable, run_work);
414         mutex_lock(&ht->mutex);
415
416         tbl = rht_dereference(ht->tbl, ht);
417         tbl = rhashtable_last_table(ht, tbl);
418
419         if (rht_grow_above_75(ht, tbl))
420                 err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
421         else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
422                 err = rhashtable_shrink(ht);
423         else if (tbl->nest)
424                 err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
425
426         if (!err || err == -EEXIST) {
427                 int nerr;
428
429                 nerr = rhashtable_rehash_table(ht);
430                 err = err ?: nerr;
431         }
432
433         mutex_unlock(&ht->mutex);
434
435         if (err)
436                 schedule_work(&ht->run_work);
437 }
438
439 static int rhashtable_insert_rehash(struct rhashtable *ht,
440                                     struct bucket_table *tbl)
441 {
442         struct bucket_table *old_tbl;
443         struct bucket_table *new_tbl;
444         unsigned int size;
445         int err;
446
447         old_tbl = rht_dereference_rcu(ht->tbl, ht);
448
449         size = tbl->size;
450
451         err = -EBUSY;
452
453         if (rht_grow_above_75(ht, tbl))
454                 size *= 2;
455         /* Do not schedule more than one rehash */
456         else if (old_tbl != tbl)
457                 goto fail;
458
459         err = -ENOMEM;
460
461         new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
462         if (new_tbl == NULL)
463                 goto fail;
464
465         err = rhashtable_rehash_attach(ht, tbl, new_tbl);
466         if (err) {
467                 bucket_table_free(new_tbl);
468                 if (err == -EEXIST)
469                         err = 0;
470         } else
471                 schedule_work(&ht->run_work);
472
473         return err;
474
475 fail:
476         /* Do not fail the insert if someone else did a rehash. */
477         if (likely(rcu_access_pointer(tbl->future_tbl)))
478                 return 0;
479
480         /* Schedule async rehash to retry allocation in process context. */
481         if (err == -ENOMEM)
482                 schedule_work(&ht->run_work);
483
484         return err;
485 }
486
487 static void *rhashtable_lookup_one(struct rhashtable *ht,
488                                    struct rhash_lock_head __rcu **bkt,
489                                    struct bucket_table *tbl, unsigned int hash,
490                                    const void *key, struct rhash_head *obj)
491 {
492         struct rhashtable_compare_arg arg = {
493                 .ht = ht,
494                 .key = key,
495         };
496         struct rhash_head __rcu **pprev = NULL;
497         struct rhash_head *head;
498         int elasticity;
499
500         elasticity = RHT_ELASTICITY;
501         rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
502                 struct rhlist_head *list;
503                 struct rhlist_head *plist;
504
505                 elasticity--;
506                 if (!key ||
507                     (ht->p.obj_cmpfn ?
508                      ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
509                      rhashtable_compare(&arg, rht_obj(ht, head)))) {
510                         pprev = &head->next;
511                         continue;
512                 }
513
514                 if (!ht->rhlist)
515                         return rht_obj(ht, head);
516
517                 list = container_of(obj, struct rhlist_head, rhead);
518                 plist = container_of(head, struct rhlist_head, rhead);
519
520                 RCU_INIT_POINTER(list->next, plist);
521                 head = rht_dereference_bucket(head->next, tbl, hash);
522                 RCU_INIT_POINTER(list->rhead.next, head);
523                 if (pprev)
524                         rcu_assign_pointer(*pprev, obj);
525                 else
526                         /* Need to preserve the bit lock */
527                         rht_assign_locked(bkt, obj);
528
529                 return NULL;
530         }
531
532         if (elasticity <= 0)
533                 return ERR_PTR(-EAGAIN);
534
535         return ERR_PTR(-ENOENT);
536 }
537
538 static struct bucket_table *rhashtable_insert_one(
539         struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
540         struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
541         void *data)
542 {
543         struct bucket_table *new_tbl;
544         struct rhash_head *head;
545
546         if (!IS_ERR_OR_NULL(data))
547                 return ERR_PTR(-EEXIST);
548
549         if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
550                 return ERR_CAST(data);
551
552         new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
553         if (new_tbl)
554                 return new_tbl;
555
556         if (PTR_ERR(data) != -ENOENT)
557                 return ERR_CAST(data);
558
559         if (unlikely(rht_grow_above_max(ht, tbl)))
560                 return ERR_PTR(-E2BIG);
561
562         if (unlikely(rht_grow_above_100(ht, tbl)))
563                 return ERR_PTR(-EAGAIN);
564
565         head = rht_ptr(bkt, tbl, hash);
566
567         RCU_INIT_POINTER(obj->next, head);
568         if (ht->rhlist) {
569                 struct rhlist_head *list;
570
571                 list = container_of(obj, struct rhlist_head, rhead);
572                 RCU_INIT_POINTER(list->next, NULL);
573         }
574
575         /* bkt is always the head of the list, so it holds
576          * the lock, which we need to preserve
577          */
578         rht_assign_locked(bkt, obj);
579
580         atomic_inc(&ht->nelems);
581         if (rht_grow_above_75(ht, tbl))
582                 schedule_work(&ht->run_work);
583
584         return NULL;
585 }
586
587 static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
588                                    struct rhash_head *obj)
589 {
590         struct bucket_table *new_tbl;
591         struct bucket_table *tbl;
592         struct rhash_lock_head __rcu **bkt;
593         unsigned int hash;
594         void *data;
595
596         new_tbl = rcu_dereference(ht->tbl);
597
598         do {
599                 tbl = new_tbl;
600                 hash = rht_head_hashfn(ht, tbl, obj, ht->p);
601                 if (rcu_access_pointer(tbl->future_tbl))
602                         /* Failure is OK */
603                         bkt = rht_bucket_var(tbl, hash);
604                 else
605                         bkt = rht_bucket_insert(ht, tbl, hash);
606                 if (bkt == NULL) {
607                         new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
608                         data = ERR_PTR(-EAGAIN);
609                 } else {
610                         rht_lock(tbl, bkt);
611                         data = rhashtable_lookup_one(ht, bkt, tbl,
612                                                      hash, key, obj);
613                         new_tbl = rhashtable_insert_one(ht, bkt, tbl,
614                                                         hash, obj, data);
615                         if (PTR_ERR(new_tbl) != -EEXIST)
616                                 data = ERR_CAST(new_tbl);
617
618                         rht_unlock(tbl, bkt);
619                 }
620         } while (!IS_ERR_OR_NULL(new_tbl));
621
622         if (PTR_ERR(data) == -EAGAIN)
623                 data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
624                                -EAGAIN);
625
626         return data;
627 }
628
629 void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
630                              struct rhash_head *obj)
631 {
632         void *data;
633
634         do {
635                 rcu_read_lock();
636                 data = rhashtable_try_insert(ht, key, obj);
637                 rcu_read_unlock();
638         } while (PTR_ERR(data) == -EAGAIN);
639
640         return data;
641 }
642 EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
643
644 /**
645  * rhashtable_walk_enter - Initialise an iterator
646  * @ht:         Table to walk over
647  * @iter:       Hash table Iterator
648  *
649  * This function prepares a hash table walk.
650  *
651  * Note that if you restart a walk after rhashtable_walk_stop you
652  * may see the same object twice.  Also, you may miss objects if
653  * there are removals in between rhashtable_walk_stop and the next
654  * call to rhashtable_walk_start.
655  *
656  * For a completely stable walk you should construct your own data
657  * structure outside the hash table.
658  *
659  * This function may be called from any process context, including
660  * non-preemptable context, but cannot be called from softirq or
661  * hardirq context.
662  *
663  * You must call rhashtable_walk_exit after this function returns.
664  */
665 void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
666 {
667         iter->ht = ht;
668         iter->p = NULL;
669         iter->slot = 0;
670         iter->skip = 0;
671         iter->end_of_table = 0;
672
673         spin_lock(&ht->lock);
674         iter->walker.tbl =
675                 rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
676         list_add(&iter->walker.list, &iter->walker.tbl->walkers);
677         spin_unlock(&ht->lock);
678 }
679 EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
680
681 /**
682  * rhashtable_walk_exit - Free an iterator
683  * @iter:       Hash table Iterator
684  *
685  * This function frees resources allocated by rhashtable_walk_enter.
686  */
687 void rhashtable_walk_exit(struct rhashtable_iter *iter)
688 {
689         spin_lock(&iter->ht->lock);
690         if (iter->walker.tbl)
691                 list_del(&iter->walker.list);
692         spin_unlock(&iter->ht->lock);
693 }
694 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
695
696 /**
697  * rhashtable_walk_start_check - Start a hash table walk
698  * @iter:       Hash table iterator
699  *
700  * Start a hash table walk at the current iterator position.  Note that we take
701  * the RCU lock in all cases including when we return an error.  So you must
702  * always call rhashtable_walk_stop to clean up.
703  *
704  * Returns zero if successful.
705  *
706  * Returns -EAGAIN if resize event occured.  Note that the iterator
707  * will rewind back to the beginning and you may use it immediately
708  * by calling rhashtable_walk_next.
709  *
710  * rhashtable_walk_start is defined as an inline variant that returns
711  * void. This is preferred in cases where the caller would ignore
712  * resize events and always continue.
713  */
714 int rhashtable_walk_start_check(struct rhashtable_iter *iter)
715         __acquires(RCU)
716 {
717         struct rhashtable *ht = iter->ht;
718         bool rhlist = ht->rhlist;
719
720         rcu_read_lock();
721
722         spin_lock(&ht->lock);
723         if (iter->walker.tbl)
724                 list_del(&iter->walker.list);
725         spin_unlock(&ht->lock);
726
727         if (iter->end_of_table)
728                 return 0;
729         if (!iter->walker.tbl) {
730                 iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
731                 iter->slot = 0;
732                 iter->skip = 0;
733                 return -EAGAIN;
734         }
735
736         if (iter->p && !rhlist) {
737                 /*
738                  * We need to validate that 'p' is still in the table, and
739                  * if so, update 'skip'
740                  */
741                 struct rhash_head *p;
742                 int skip = 0;
743                 rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
744                         skip++;
745                         if (p == iter->p) {
746                                 iter->skip = skip;
747                                 goto found;
748                         }
749                 }
750                 iter->p = NULL;
751         } else if (iter->p && rhlist) {
752                 /* Need to validate that 'list' is still in the table, and
753                  * if so, update 'skip' and 'p'.
754                  */
755                 struct rhash_head *p;
756                 struct rhlist_head *list;
757                 int skip = 0;
758                 rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
759                         for (list = container_of(p, struct rhlist_head, rhead);
760                              list;
761                              list = rcu_dereference(list->next)) {
762                                 skip++;
763                                 if (list == iter->list) {
764                                         iter->p = p;
765                                         iter->skip = skip;
766                                         goto found;
767                                 }
768                         }
769                 }
770                 iter->p = NULL;
771         }
772 found:
773         return 0;
774 }
775 EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
776
777 /**
778  * __rhashtable_walk_find_next - Find the next element in a table (or the first
779  * one in case of a new walk).
780  *
781  * @iter:       Hash table iterator
782  *
783  * Returns the found object or NULL when the end of the table is reached.
784  *
785  * Returns -EAGAIN if resize event occurred.
786  */
787 static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
788 {
789         struct bucket_table *tbl = iter->walker.tbl;
790         struct rhlist_head *list = iter->list;
791         struct rhashtable *ht = iter->ht;
792         struct rhash_head *p = iter->p;
793         bool rhlist = ht->rhlist;
794
795         if (!tbl)
796                 return NULL;
797
798         for (; iter->slot < tbl->size; iter->slot++) {
799                 int skip = iter->skip;
800
801                 rht_for_each_rcu(p, tbl, iter->slot) {
802                         if (rhlist) {
803                                 list = container_of(p, struct rhlist_head,
804                                                     rhead);
805                                 do {
806                                         if (!skip)
807                                                 goto next;
808                                         skip--;
809                                         list = rcu_dereference(list->next);
810                                 } while (list);
811
812                                 continue;
813                         }
814                         if (!skip)
815                                 break;
816                         skip--;
817                 }
818
819 next:
820                 if (!rht_is_a_nulls(p)) {
821                         iter->skip++;
822                         iter->p = p;
823                         iter->list = list;
824                         return rht_obj(ht, rhlist ? &list->rhead : p);
825                 }
826
827                 iter->skip = 0;
828         }
829
830         iter->p = NULL;
831
832         /* Ensure we see any new tables. */
833         smp_rmb();
834
835         iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
836         if (iter->walker.tbl) {
837                 iter->slot = 0;
838                 iter->skip = 0;
839                 return ERR_PTR(-EAGAIN);
840         } else {
841                 iter->end_of_table = true;
842         }
843
844         return NULL;
845 }
846
847 /**
848  * rhashtable_walk_next - Return the next object and advance the iterator
849  * @iter:       Hash table iterator
850  *
851  * Note that you must call rhashtable_walk_stop when you are finished
852  * with the walk.
853  *
854  * Returns the next object or NULL when the end of the table is reached.
855  *
856  * Returns -EAGAIN if resize event occurred.  Note that the iterator
857  * will rewind back to the beginning and you may continue to use it.
858  */
859 void *rhashtable_walk_next(struct rhashtable_iter *iter)
860 {
861         struct rhlist_head *list = iter->list;
862         struct rhashtable *ht = iter->ht;
863         struct rhash_head *p = iter->p;
864         bool rhlist = ht->rhlist;
865
866         if (p) {
867                 if (!rhlist || !(list = rcu_dereference(list->next))) {
868                         p = rcu_dereference(p->next);
869                         list = container_of(p, struct rhlist_head, rhead);
870                 }
871                 if (!rht_is_a_nulls(p)) {
872                         iter->skip++;
873                         iter->p = p;
874                         iter->list = list;
875                         return rht_obj(ht, rhlist ? &list->rhead : p);
876                 }
877
878                 /* At the end of this slot, switch to next one and then find
879                  * next entry from that point.
880                  */
881                 iter->skip = 0;
882                 iter->slot++;
883         }
884
885         return __rhashtable_walk_find_next(iter);
886 }
887 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
888
889 /**
890  * rhashtable_walk_peek - Return the next object but don't advance the iterator
891  * @iter:       Hash table iterator
892  *
893  * Returns the next object or NULL when the end of the table is reached.
894  *
895  * Returns -EAGAIN if resize event occurred.  Note that the iterator
896  * will rewind back to the beginning and you may continue to use it.
897  */
898 void *rhashtable_walk_peek(struct rhashtable_iter *iter)
899 {
900         struct rhlist_head *list = iter->list;
901         struct rhashtable *ht = iter->ht;
902         struct rhash_head *p = iter->p;
903
904         if (p)
905                 return rht_obj(ht, ht->rhlist ? &list->rhead : p);
906
907         /* No object found in current iter, find next one in the table. */
908
909         if (iter->skip) {
910                 /* A nonzero skip value points to the next entry in the table
911                  * beyond that last one that was found. Decrement skip so
912                  * we find the current value. __rhashtable_walk_find_next
913                  * will restore the original value of skip assuming that
914                  * the table hasn't changed.
915                  */
916                 iter->skip--;
917         }
918
919         return __rhashtable_walk_find_next(iter);
920 }
921 EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
922
923 /**
924  * rhashtable_walk_stop - Finish a hash table walk
925  * @iter:       Hash table iterator
926  *
927  * Finish a hash table walk.  Does not reset the iterator to the start of the
928  * hash table.
929  */
930 void rhashtable_walk_stop(struct rhashtable_iter *iter)
931         __releases(RCU)
932 {
933         struct rhashtable *ht;
934         struct bucket_table *tbl = iter->walker.tbl;
935
936         if (!tbl)
937                 goto out;
938
939         ht = iter->ht;
940
941         spin_lock(&ht->lock);
942         if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
943                 /* This bucket table is being freed, don't re-link it. */
944                 iter->walker.tbl = NULL;
945         else
946                 list_add(&iter->walker.list, &tbl->walkers);
947         spin_unlock(&ht->lock);
948
949 out:
950         rcu_read_unlock();
951 }
952 EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
953
954 static size_t rounded_hashtable_size(const struct rhashtable_params *params)
955 {
956         size_t retsize;
957
958         if (params->nelem_hint)
959                 retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
960                               (unsigned long)params->min_size);
961         else
962                 retsize = max(HASH_DEFAULT_SIZE,
963                               (unsigned long)params->min_size);
964
965         return retsize;
966 }
967
968 static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
969 {
970         return jhash2(key, length, seed);
971 }
972
973 /**
974  * rhashtable_init - initialize a new hash table
975  * @ht:         hash table to be initialized
976  * @params:     configuration parameters
977  *
978  * Initializes a new hash table based on the provided configuration
979  * parameters. A table can be configured either with a variable or
980  * fixed length key:
981  *
982  * Configuration Example 1: Fixed length keys
983  * struct test_obj {
984  *      int                     key;
985  *      void *                  my_member;
986  *      struct rhash_head       node;
987  * };
988  *
989  * struct rhashtable_params params = {
990  *      .head_offset = offsetof(struct test_obj, node),
991  *      .key_offset = offsetof(struct test_obj, key),
992  *      .key_len = sizeof(int),
993  *      .hashfn = jhash,
994  * };
995  *
996  * Configuration Example 2: Variable length keys
997  * struct test_obj {
998  *      [...]
999  *      struct rhash_head       node;
1000  * };
1001  *
1002  * u32 my_hash_fn(const void *data, u32 len, u32 seed)
1003  * {
1004  *      struct test_obj *obj = data;
1005  *
1006  *      return [... hash ...];
1007  * }
1008  *
1009  * struct rhashtable_params params = {
1010  *      .head_offset = offsetof(struct test_obj, node),
1011  *      .hashfn = jhash,
1012  *      .obj_hashfn = my_hash_fn,
1013  * };
1014  */
1015 int rhashtable_init(struct rhashtable *ht,
1016                     const struct rhashtable_params *params)
1017 {
1018         struct bucket_table *tbl;
1019         size_t size;
1020
1021         if ((!params->key_len && !params->obj_hashfn) ||
1022             (params->obj_hashfn && !params->obj_cmpfn))
1023                 return -EINVAL;
1024
1025         memset(ht, 0, sizeof(*ht));
1026         mutex_init(&ht->mutex);
1027         spin_lock_init(&ht->lock);
1028         memcpy(&ht->p, params, sizeof(*params));
1029
1030         if (params->min_size)
1031                 ht->p.min_size = roundup_pow_of_two(params->min_size);
1032
1033         /* Cap total entries at 2^31 to avoid nelems overflow. */
1034         ht->max_elems = 1u << 31;
1035
1036         if (params->max_size) {
1037                 ht->p.max_size = rounddown_pow_of_two(params->max_size);
1038                 if (ht->p.max_size < ht->max_elems / 2)
1039                         ht->max_elems = ht->p.max_size * 2;
1040         }
1041
1042         ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1043
1044         size = rounded_hashtable_size(&ht->p);
1045
1046         ht->key_len = ht->p.key_len;
1047         if (!params->hashfn) {
1048                 ht->p.hashfn = jhash;
1049
1050                 if (!(ht->key_len & (sizeof(u32) - 1))) {
1051                         ht->key_len /= sizeof(u32);
1052                         ht->p.hashfn = rhashtable_jhash2;
1053                 }
1054         }
1055
1056         /*
1057          * This is api initialization and thus we need to guarantee the
1058          * initial rhashtable allocation. Upon failure, retry with the
1059          * smallest possible size with __GFP_NOFAIL semantics.
1060          */
1061         tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
1062         if (unlikely(tbl == NULL)) {
1063                 size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1064                 tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
1065         }
1066
1067         atomic_set(&ht->nelems, 0);
1068
1069         RCU_INIT_POINTER(ht->tbl, tbl);
1070
1071         INIT_WORK(&ht->run_work, rht_deferred_worker);
1072
1073         return 0;
1074 }
1075 EXPORT_SYMBOL_GPL(rhashtable_init);
1076
1077 /**
1078  * rhltable_init - initialize a new hash list table
1079  * @hlt:        hash list table to be initialized
1080  * @params:     configuration parameters
1081  *
1082  * Initializes a new hash list table.
1083  *
1084  * See documentation for rhashtable_init.
1085  */
1086 int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
1087 {
1088         int err;
1089
1090         err = rhashtable_init(&hlt->ht, params);
1091         hlt->ht.rhlist = true;
1092         return err;
1093 }
1094 EXPORT_SYMBOL_GPL(rhltable_init);
1095
1096 static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
1097                                 void (*free_fn)(void *ptr, void *arg),
1098                                 void *arg)
1099 {
1100         struct rhlist_head *list;
1101
1102         if (!ht->rhlist) {
1103                 free_fn(rht_obj(ht, obj), arg);
1104                 return;
1105         }
1106
1107         list = container_of(obj, struct rhlist_head, rhead);
1108         do {
1109                 obj = &list->rhead;
1110                 list = rht_dereference(list->next, ht);
1111                 free_fn(rht_obj(ht, obj), arg);
1112         } while (list);
1113 }
1114
1115 /**
1116  * rhashtable_free_and_destroy - free elements and destroy hash table
1117  * @ht:         the hash table to destroy
1118  * @free_fn:    callback to release resources of element
1119  * @arg:        pointer passed to free_fn
1120  *
1121  * Stops an eventual async resize. If defined, invokes free_fn for each
1122  * element to releasal resources. Please note that RCU protected
1123  * readers may still be accessing the elements. Releasing of resources
1124  * must occur in a compatible manner. Then frees the bucket array.
1125  *
1126  * This function will eventually sleep to wait for an async resize
1127  * to complete. The caller is responsible that no further write operations
1128  * occurs in parallel.
1129  */
1130 void rhashtable_free_and_destroy(struct rhashtable *ht,
1131                                  void (*free_fn)(void *ptr, void *arg),
1132                                  void *arg)
1133 {
1134         struct bucket_table *tbl, *next_tbl;
1135         unsigned int i;
1136
1137         cancel_work_sync(&ht->run_work);
1138
1139         mutex_lock(&ht->mutex);
1140         tbl = rht_dereference(ht->tbl, ht);
1141 restart:
1142         if (free_fn) {
1143                 for (i = 0; i < tbl->size; i++) {
1144                         struct rhash_head *pos, *next;
1145
1146                         cond_resched();
1147                         for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
1148                              next = !rht_is_a_nulls(pos) ?
1149                                         rht_dereference(pos->next, ht) : NULL;
1150                              !rht_is_a_nulls(pos);
1151                              pos = next,
1152                              next = !rht_is_a_nulls(pos) ?
1153                                         rht_dereference(pos->next, ht) : NULL)
1154                                 rhashtable_free_one(ht, pos, free_fn, arg);
1155                 }
1156         }
1157
1158         next_tbl = rht_dereference(tbl->future_tbl, ht);
1159         bucket_table_free(tbl);
1160         if (next_tbl) {
1161                 tbl = next_tbl;
1162                 goto restart;
1163         }
1164         mutex_unlock(&ht->mutex);
1165 }
1166 EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
1167
1168 void rhashtable_destroy(struct rhashtable *ht)
1169 {
1170         return rhashtable_free_and_destroy(ht, NULL, NULL);
1171 }
1172 EXPORT_SYMBOL_GPL(rhashtable_destroy);
1173
1174 struct rhash_lock_head __rcu **__rht_bucket_nested(
1175         const struct bucket_table *tbl, unsigned int hash)
1176 {
1177         const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1178         unsigned int index = hash & ((1 << tbl->nest) - 1);
1179         unsigned int size = tbl->size >> tbl->nest;
1180         unsigned int subhash = hash;
1181         union nested_table *ntbl;
1182
1183         ntbl = nested_table_top(tbl);
1184         ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
1185         subhash >>= tbl->nest;
1186
1187         while (ntbl && size > (1 << shift)) {
1188                 index = subhash & ((1 << shift) - 1);
1189                 ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
1190                                                   tbl, hash);
1191                 size >>= shift;
1192                 subhash >>= shift;
1193         }
1194
1195         if (!ntbl)
1196                 return NULL;
1197
1198         return &ntbl[subhash].bucket;
1199
1200 }
1201 EXPORT_SYMBOL_GPL(__rht_bucket_nested);
1202
1203 struct rhash_lock_head __rcu **rht_bucket_nested(
1204         const struct bucket_table *tbl, unsigned int hash)
1205 {
1206         static struct rhash_lock_head __rcu *rhnull;
1207
1208         if (!rhnull)
1209                 INIT_RHT_NULLS_HEAD(rhnull);
1210         return __rht_bucket_nested(tbl, hash) ?: &rhnull;
1211 }
1212 EXPORT_SYMBOL_GPL(rht_bucket_nested);
1213
1214 struct rhash_lock_head __rcu **rht_bucket_nested_insert(
1215         struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
1216 {
1217         const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1218         unsigned int index = hash & ((1 << tbl->nest) - 1);
1219         unsigned int size = tbl->size >> tbl->nest;
1220         union nested_table *ntbl;
1221
1222         ntbl = nested_table_top(tbl);
1223         hash >>= tbl->nest;
1224         ntbl = nested_table_alloc(ht, &ntbl[index].table,
1225                                   size <= (1 << shift));
1226
1227         while (ntbl && size > (1 << shift)) {
1228                 index = hash & ((1 << shift) - 1);
1229                 size >>= shift;
1230                 hash >>= shift;
1231                 ntbl = nested_table_alloc(ht, &ntbl[index].table,
1232                                           size <= (1 << shift));
1233         }
1234
1235         if (!ntbl)
1236                 return NULL;
1237
1238         return &ntbl[hash].bucket;
1239
1240 }
1241 EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);