bpf: Call free_htab_elem() after htab_unlock_bucket()
author Hou Tao <houtao1@huawei.com>
Wed, 6 Nov 2024 06:35:40 +0000 (14:35 +0800)
committer Andrii Nakryiko <andrii@kernel.org>
Mon, 11 Nov 2024 16:18:30 +0000 (08:18 -0800)
For htab of maps, when an inner map is removed from the htab, the htab
element may hold the last reference of that map. bpf_map_fd_put_ptr()
will then invoke bpf_map_free_id() to free the id of the removed map.
However, bpf_map_fd_put_ptr() is invoked while holding a bucket lock
(raw_spinlock_t), and bpf_map_free_id() attempts to acquire map_idr_lock
(spinlock_t), triggering the following lockdep warning:

  =============================
  [ BUG: Invalid wait context ]
  6.11.0-rc4+ #49 Not tainted
  -----------------------------
  test_maps/4881 is trying to lock:
  ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70
  other info that might help us debug this:
  context-{5:5}
  2 locks held by test_maps/4881:
   #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270
   #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80
  stack backtrace:
  CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x6e/0xb0
   dump_stack+0x10/0x20
   __lock_acquire+0x73e/0x36c0
   lock_acquire+0x182/0x450
   _raw_spin_lock_irqsave+0x43/0x70
   bpf_map_free_id.part.0+0x21/0x70
   bpf_map_put+0xcf/0x110
   bpf_map_fd_put_ptr+0x9a/0xb0
   free_htab_elem+0x69/0xe0
   htab_map_update_elem+0x50f/0xa80
   bpf_fd_htab_map_update_elem+0x131/0x270
   bpf_map_update_value+0x266/0x380
   __sys_bpf+0x21bb/0x36b0
   __x64_sys_bpf+0x45/0x60
   x64_sys_call+0x1b2a/0x20d0
   do_syscall_64+0x5d/0x100
   entry_SYSCALL_64_after_hwframe+0x76/0x7e

One way to fix the lockdep warning is to use raw_spinlock_t for
map_idr_lock as well. However, bpf_map_alloc_id() invokes
idr_alloc_cyclic() after acquiring map_idr_lock, which would trigger a
similar lockdep warning because the slab's lock (s->cpu_slab->lock) is
still a spinlock.

Instead of changing map_idr_lock's type, fix the issue by invoking
htab_put_fd_value() after htab_unlock_bucket(). However, deferring only
the invocation of htab_put_fd_value() is not enough, because the old map
pointers in an htab of maps cannot be saved during batched deletion.
Therefore, also defer the invocation of free_htab_elem(), so the
to-be-freed elements can be linked together, similar to the LRU map.
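
In short, the pattern (condensed from the htab_map_delete_elem() hunk
below) is to unlink the element under the bucket lock but free it only
after the lock has been dropped:

  l = lookup_elem_raw(head, hash, key, key_size);
  if (l)
          hlist_nulls_del_rcu(&l->hash_node);
  else
          ret = -ENOENT;

  htab_unlock_bucket(htab, b, hash, flags);

  /* bpf_map_fd_put_ptr() may take map_idr_lock (a spinlock_t),
   * so the element is freed outside the raw bucket lock.
   */
  if (l)
          free_htab_elem(htab, l);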

There are four callers of ->map_fd_put_ptr():

(1) alloc_htab_elem() (through htab_put_fd_value())
It invokes ->map_fd_put_ptr() under a raw_spinlock_t. The invocation of
htab_put_fd_value() cannot simply be moved after htab_unlock_bucket(),
because the old element has already been stashed in htab->extra_elems.
It may be reused immediately after htab_unlock_bucket(), and invoking
htab_put_fd_value() at that point may release the newly-added element
incorrectly. Therefore, save the map pointer of the old element for htab
of maps before unlocking the bucket and release the map_ptr after unlock.
Besides the map pointer, do the same for the special fields in the old
element as well (see the sketch below).
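
A condensed sketch of the new htab_map_update_elem() logic when an old
element exists and the map is preallocated (taken from the hunk below):

  old_map_ptr = NULL;
  if (htab_is_prealloc(htab)) {
          /* l_old is already stashed in htab->extra_elems, so only strip
           * its map pointer and special fields while still under the lock.
           */
          if (map->ops->map_fd_put_ptr)
                  old_map_ptr = fd_htab_map_get_ptr(map, l_old);
          check_and_free_fields(htab, l_old);
  }
  htab_unlock_bucket(htab, b, hash, flags);
  if (old_map_ptr)
          map->ops->map_fd_put_ptr(map, old_map_ptr, true);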

(2) free_htab_elem() (through htab_put_fd_value())
Its callers include __htab_map_lookup_and_delete_elem(),
htab_map_delete_elem() and __htab_map_lookup_and_delete_batch().

For htab_map_delete_elem(), simply invoke free_htab_elem() after
htab_unlock_bucket(). For __htab_map_lookup_and_delete_batch(), just
like the LRU map, link the to-be-freed elements into the node_to_free
list and invoke free_htab_elem() for these elements after unlock. It is
safe to reuse batch_flink as the link for node_to_free, because these
elements have already been removed from the hash llist.

Because htab of maps doesn't support the lookup_and_delete operation,
__htab_map_lookup_and_delete_elem() doesn't have this problem, so it is
kept as-is.
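
Condensed from the __htab_map_lookup_and_delete_batch() hunk below, the
to-be-freed elements are queued under the bucket lock and freed afterwards:

  /* under the bucket lock: unlink the element and queue it */
  hlist_nulls_del_rcu(&l->hash_node);
  l->batch_flink = node_to_free;
  node_to_free = l;

  /* after htab_unlock_bucket(): actually free the queued elements */
  while (node_to_free) {
          l = node_to_free;
          node_to_free = node_to_free->batch_flink;
          if (is_lru_map)
                  htab_lru_push_free(htab, l);
          else
                  free_htab_elem(htab, l);
  }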

(3) fd_htab_map_free()
It invokes ->map_fd_put_ptr() without holding a raw_spinlock_t, so no
change is needed.

(4) bpf_fd_htab_map_update_elem()
It invokes ->map_fd_put_ptr() without holding a raw_spinlock_t, so no
change is needed.

After moving free_htab_elem() out of the htab bucket lock's scope, use
pcpu_freelist_push() instead of __pcpu_freelist_push() to disable irqs
before freeing elements, and protect the invocations of
bpf_mem_cache_free() with a migrate_{disable|enable} pair.
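
For example, htab_elem_free() now wraps the bpf_mem_cache_free() calls
(first hunk below), since bpf_mem_cache_free() relies on per-CPU state
and is no longer called with the irq-disabling bucket lock held:

  static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
  {
          check_and_free_fields(htab, l);

          migrate_disable();
          if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
                  bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
          bpf_mem_cache_free(&htab->ma, l);
          migrate_enable();
  }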

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20241106063542.357743-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
kernel/bpf/hashtab.c

index b14b874..3ec941a 100644
@@ -896,9 +896,12 @@ find_first_elem:
 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
        check_and_free_fields(htab, l);
+
+       migrate_disable();
        if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
                bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
        bpf_mem_cache_free(&htab->ma, l);
+       migrate_enable();
 }
 
 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
        if (htab_is_prealloc(htab)) {
                bpf_map_dec_elem_count(&htab->map);
                check_and_free_fields(htab, l);
-               __pcpu_freelist_push(&htab->freelist, &l->fnode);
+               pcpu_freelist_push(&htab->freelist, &l->fnode);
        } else {
                dec_elem_count(htab);
                htab_elem_free(htab, l);
@@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                         */
                        pl_new = this_cpu_ptr(htab->extra_elems);
                        l_new = *pl_new;
-                       htab_put_fd_value(htab, old_elem);
                        *pl_new = old_elem;
                } else {
                        struct pcpu_freelist_node *l;
@@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
        struct htab_elem *l_new = NULL, *l_old;
        struct hlist_nulls_head *head;
        unsigned long flags;
+       void *old_map_ptr;
        struct bucket *b;
        u32 key_size, hash;
        int ret;
@@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
        hlist_nulls_add_head_rcu(&l_new->hash_node, head);
        if (l_old) {
                hlist_nulls_del_rcu(&l_old->hash_node);
+
+               /* l_old has already been stashed in htab->extra_elems, free
+                * its special fields before it is available for reuse. Also
+                * save the old map pointer in htab of maps before unlock
+                * and release it after unlock.
+                */
+               old_map_ptr = NULL;
+               if (htab_is_prealloc(htab)) {
+                       if (map->ops->map_fd_put_ptr)
+                               old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+                       check_and_free_fields(htab, l_old);
+               }
+       }
+       htab_unlock_bucket(htab, b, hash, flags);
+       if (l_old) {
+               if (old_map_ptr)
+                       map->ops->map_fd_put_ptr(map, old_map_ptr, true);
                if (!htab_is_prealloc(htab))
                        free_htab_elem(htab, l_old);
-               else
-                       check_and_free_fields(htab, l_old);
        }
-       ret = 0;
+       return 0;
 err:
        htab_unlock_bucket(htab, b, hash, flags);
        return ret;
@@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
                return ret;
 
        l = lookup_elem_raw(head, hash, key, key_size);
-
-       if (l) {
+       if (l)
                hlist_nulls_del_rcu(&l->hash_node);
-               free_htab_elem(htab, l);
-       } else {
+       else
                ret = -ENOENT;
-       }
 
        htab_unlock_bucket(htab, b, hash, flags);
+
+       if (l)
+               free_htab_elem(htab, l);
        return ret;
 }
 
@@ -1853,13 +1871,14 @@ again_nocopy:
                         * may cause deadlock. See comments in function
                         * prealloc_lru_pop(). Let us do bpf_lru_push_free()
                         * after releasing the bucket lock.
+                        *
+                        * For htab of maps, htab_put_fd_value() in
+                        * free_htab_elem() may acquire a spinlock with bucket
+                        * lock being held and it violates the lock rule, so
+                        * invoke free_htab_elem() after unlock as well.
                         */
-                       if (is_lru_map) {
-                               l->batch_flink = node_to_free;
-                               node_to_free = l;
-                       } else {
-                               free_htab_elem(htab, l);
-                       }
+                       l->batch_flink = node_to_free;
+                       node_to_free = l;
                }
                dst_key += key_size;
                dst_val += value_size;
@@ -1871,7 +1890,10 @@ again_nocopy:
        while (node_to_free) {
                l = node_to_free;
                node_to_free = node_to_free->batch_flink;
-               htab_lru_push_free(htab, l);
+               if (is_lru_map)
+                       htab_lru_push_free(htab, l);
+               else
+                       free_htab_elem(htab, l);
        }
 
 next_batch: