bpf, sockmap: Don't sleep while holding RCU lock on tear-down
authorJakub Sitnicki <jakub@cloudflare.com>
Thu, 6 Feb 2020 11:16:50 +0000 (12:16 +0100)
committerDaniel Borkmann <daniel@iogearbox.net>
Fri, 7 Feb 2020 21:36:26 +0000 (22:36 +0100)
rcu_read_lock is needed to protect access to psock inside sock_map_unref
when tearing down the map. However, we can't afford to sleep in lock_sock
while in RCU read-side critical section. Grab the RCU lock only after we
have locked the socket.

This fixes RCU warnings triggerable on a VM with 1 vCPU when free'ing a
sockmap/sockhash that contains at least one socket:

| =============================
| WARNING: suspicious RCU usage
5.5.0-04005-g8fc91b972b73 #450 Not tainted
| -----------------------------
| include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section!
|
| other info that might help us debug this:
|
|
| rcu_scheduler_active = 2, debug_locks = 1
| 4 locks held by kworker/0:1/62:
|  #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_map_free+0x5/0x170
|  #3: ffff8881368c5df8 (&stab->lock){+...}, at: sock_map_free+0x64/0x170
|
| stack backtrace:
| CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73 #450
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
| Workqueue: events bpf_map_free_deferred
| Call Trace:
|  dump_stack+0x71/0xa0
|  ___might_sleep+0x105/0x190
|  lock_sock_nested+0x28/0x90
|  sock_map_free+0x95/0x170
|  bpf_map_free_deferred+0x58/0x80
|  process_one_work+0x260/0x5e0
|  worker_thread+0x4d/0x3e0
|  kthread+0x108/0x140
|  ? process_one_work+0x5e0/0x5e0
|  ? kthread_park+0x90/0x90
|  ret_from_fork+0x3a/0x50

| =============================
| WARNING: suspicious RCU usage
5.5.0-04005-g8fc91b972b73-dirty #452 Not tainted
| -----------------------------
| include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section!
|
| other info that might help us debug this:
|
|
| rcu_scheduler_active = 2, debug_locks = 1
| 4 locks held by kworker/0:1/62:
|  #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_hash_free+0x5/0x1d0
|  #3: ffff888139966e00 (&htab->buckets[i].lock){+...}, at: sock_hash_free+0x92/0x1d0
|
| stack backtrace:
| CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73-dirty #452
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
| Workqueue: events bpf_map_free_deferred
| Call Trace:
|  dump_stack+0x71/0xa0
|  ___might_sleep+0x105/0x190
|  lock_sock_nested+0x28/0x90
|  sock_hash_free+0xec/0x1d0
|  bpf_map_free_deferred+0x58/0x80
|  process_one_work+0x260/0x5e0
|  worker_thread+0x4d/0x3e0
|  kthread+0x108/0x140
|  ? process_one_work+0x5e0/0x5e0
|  ? kthread_park+0x90/0x90
|  ret_from_fork+0x3a/0x50

Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200206111652.694507-2-jakub@cloudflare.com
net/core/sock_map.c

index 36a2433..9925044 100644 (file)
@@ -234,7 +234,6 @@ static void sock_map_free(struct bpf_map *map)
        int i;
 
        synchronize_rcu();
-       rcu_read_lock();
        raw_spin_lock_bh(&stab->lock);
        for (i = 0; i < stab->map.max_entries; i++) {
                struct sock **psk = &stab->sks[i];
@@ -243,12 +242,13 @@ static void sock_map_free(struct bpf_map *map)
                sk = xchg(psk, NULL);
                if (sk) {
                        lock_sock(sk);
+                       rcu_read_lock();
                        sock_map_unref(sk, psk);
+                       rcu_read_unlock();
                        release_sock(sk);
                }
        }
        raw_spin_unlock_bh(&stab->lock);
-       rcu_read_unlock();
 
        synchronize_rcu();
 
@@ -863,19 +863,19 @@ static void sock_hash_free(struct bpf_map *map)
        int i;
 
        synchronize_rcu();
-       rcu_read_lock();
        for (i = 0; i < htab->buckets_num; i++) {
                bucket = sock_hash_select_bucket(htab, i);
                raw_spin_lock_bh(&bucket->lock);
                hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
                        hlist_del_rcu(&elem->node);
                        lock_sock(elem->sk);
+                       rcu_read_lock();
                        sock_map_unref(elem->sk, elem);
+                       rcu_read_unlock();
                        release_sock(elem->sk);
                }
                raw_spin_unlock_bh(&bucket->lock);
        }
-       rcu_read_unlock();
 
        bpf_map_area_free(htab->buckets);
        kfree(htab);