bcachefs: Really don't hold btree locks while btree IOs are in flight
authorKent Overstreet <kent.overstreet@gmail.com>
Sun, 11 Jul 2021 03:03:15 +0000 (23:03 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:08 +0000 (17:09 -0400)
This is something we've attempted to stick to for quite some time, as it
helps guarantee filesystem latency — but there are a few remaining paths
that this patch fixes.

This is also necessary for an upcoming patch to update btree pointers
after every btree write - since the btree write completion path will now
be doing btree operations.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/debug.c

index 15f597a..051d286 100644 (file)
@@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
        int ret = 0;
 
        lockdep_assert_held(&bc->lock);
+wait_on_io:
+       if (b->flags & ((1U << BTREE_NODE_dirty)|
+                       (1U << BTREE_NODE_read_in_flight)|
+                       (1U << BTREE_NODE_write_in_flight))) {
+               if (!flush)
+                       return -ENOMEM;
+
+               /* XXX: waiting on IO with btree cache lock held */
+               bch2_btree_node_wait_on_read(b);
+               bch2_btree_node_wait_on_write(b);
+       }
 
        if (!six_trylock_intent(&b->c.lock))
                return -ENOMEM;
@@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
        if (!six_trylock_write(&b->c.lock))
                goto out_unlock_intent;
 
+       /* recheck under lock */
+       if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+                       (1U << BTREE_NODE_write_in_flight))) {
+               if (!flush)
+                       goto out_unlock;
+               six_unlock_write(&b->c.lock);
+               six_unlock_intent(&b->c.lock);
+               goto wait_on_io;
+       }
+
        if (btree_node_noevict(b))
                goto out_unlock;
 
        if (!btree_node_may_write(b))
                goto out_unlock;
 
-       if (btree_node_dirty(b) &&
-           test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
-               goto out_unlock;
-
-       if (btree_node_dirty(b) ||
-           btree_node_write_in_flight(b) ||
-           btree_node_read_in_flight(b)) {
-               if (!flush)
+       if (btree_node_dirty(b)) {
+               if (!flush ||
+                   test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                        goto out_unlock;
-
-               wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                              TASK_UNINTERRUPTIBLE);
-
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
@@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
                else
                        __bch2_btree_node_write(c, b);
 
-               /* wait for any in flight btree write */
-               btree_node_wait_on_io(b);
+               six_unlock_write(&b->c.lock);
+               six_unlock_intent(&b->c.lock);
+               goto wait_on_io;
        }
 out:
        if (b->hash_val && !ret)
@@ -581,6 +594,7 @@ got_node:
        }
 
        BUG_ON(btree_node_hashed(b));
+       BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_write_in_flight(b));
 out:
        b->flags                = 0;
@@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
+       u32 seq;
 
        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
@@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                return NULL;
        }
 
+       set_btree_node_read_in_flight(b);
+
+       six_unlock_write(&b->c.lock);
+       seq = b->c.lock.state.seq;
+       six_unlock_intent(&b->c.lock);
+
        /* Unlock before doing IO: */
        if (iter && sync)
                bch2_trans_unlock(iter->trans);
 
        bch2_btree_node_read(c, b, sync);
 
-       six_unlock_write(&b->c.lock);
-
-       if (!sync) {
-               six_unlock_intent(&b->c.lock);
+       if (!sync)
                return NULL;
-       }
 
        /*
         * XXX: this will probably always fail because btree_iter_relock()
         * currently fails for iterators that aren't pointed at a valid btree
         * node
         */
-       if (iter && !bch2_trans_relock(iter->trans)) {
-               six_unlock_intent(&b->c.lock);
+       if (iter && !bch2_trans_relock(iter->trans))
                return ERR_PTR(-EINTR);
-       }
 
-       if (lock_type == SIX_LOCK_read)
-               six_lock_downgrade(&b->c.lock);
+       if (!six_relock_type(&b->c.lock, lock_type, seq))
+               return ERR_PTR(-EINTR);
 
        return b;
 }
@@ -831,11 +846,12 @@ lock_node:
        }
 
        if (unlikely(btree_node_read_in_flight(b))) {
+               u32 seq = b->c.lock.state.seq;
+
                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(iter->trans);
 
-               wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                              TASK_UNINTERRUPTIBLE);
+               bch2_btree_node_wait_on_read(b);
 
                /*
                 * XXX: check if this always fails - btree_iter_relock()
@@ -844,7 +860,9 @@ lock_node:
                 */
                if (iter && !bch2_trans_relock(iter->trans))
                        return ERR_PTR(-EINTR);
-               goto retry;
+
+               if (!six_relock_type(&b->c.lock, lock_type, seq))
+                       goto retry;
        }
 
        prefetch(b->aux_data);
@@ -923,8 +941,7 @@ lock_node:
        }
 
        /* XXX: waiting on IO with btree locks held: */
-       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                      TASK_UNINTERRUPTIBLE);
+       __bch2_btree_node_wait_on_read(b);
 
        prefetch(b->aux_data);
 
@@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
        b = btree_cache_find(bc, k);
        if (!b)
                return;
+wait_on_io:
+       /* not allowed to wait on io with btree locks held: */
+
+       /* XXX we're called from btree_gc which will be holding other btree
+        * nodes locked
+        * */
+       __bch2_btree_node_wait_on_read(b);
+       __bch2_btree_node_wait_on_write(b);
 
        six_lock_intent(&b->c.lock, NULL, NULL);
        six_lock_write(&b->c.lock, NULL, NULL);
 
-       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-       __bch2_btree_node_write(c, b);
-
-       /* wait for any in flight btree write */
-       btree_node_wait_on_io(b);
+       if (btree_node_dirty(b)) {
+               __bch2_btree_node_write(c, b);
+               six_unlock_write(&b->c.lock);
+               six_unlock_intent(&b->c.lock);
+               goto wait_on_io;
+       }
 
        BUG_ON(btree_node_dirty(b));
 
index 0095c78..2974b2a 100644 (file)
 
 #include <linux/sched/mm.h>
 
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+       EBUG_ON(!btree_node_write_in_flight(b));
+
+       clear_btree_node_write_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+       wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+                           TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
 static void verify_no_dups(struct btree *b,
                           struct bkey_packed *start,
                           struct bkey_packed *end)
@@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
        EBUG_ON(iter && iter->l[b->c.level].b != b);
        BUG_ON(bset_written(b, bset(b, &b->set[1])));
 
-       if (b->nsets == MAX_BSETS) {
+       if (b->nsets == MAX_BSETS &&
+           !btree_node_write_in_flight(b)) {
                unsigned log_u64s[] = {
                        ilog2(bset_u64s(&b->set[0])),
                        ilog2(bset_u64s(&b->set[1])),
@@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
        btree_pos_to_text(&PBUF(buf), c, b);
        trace_btree_read(c, b);
 
-       set_btree_node_read_in_flight(b);
-
        if (bch2_verify_all_btree_replicas &&
            !btree_node_read_all_replicas(c, b, sync))
                return;
@@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
        bkey_copy(&b->key, k);
        BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
 
+       set_btree_node_read_in_flight(b);
+
        bch2_btree_node_read(c, b, true);
 
        if (btree_node_read_error(b)) {
@@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
        struct btree_write *w = btree_prev_write(b);
 
        bch2_btree_complete_write(c, b, w);
-       btree_node_io_unlock(b);
+       bch2_btree_node_io_unlock(b);
 }
 
 static void bch2_btree_node_write_error(struct bch_fs *c,
@@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        bool validate_before_checksum = false;
        void *data;
 
+       BUG_ON(btree_node_write_in_flight(b));
+
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
 
@@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                         * XXX waiting on btree writes with btree locks held -
                         * this can deadlock, and we hit the write error path
                         */
-                       btree_node_wait_on_io(b);
+                       bch2_btree_node_wait_on_write(b);
                        continue;
                }
 
index fae6762..89fd4ab 100644 (file)
@@ -52,24 +52,12 @@ struct btree_write_bio {
        struct bch_write_bio    wbio;
 };
 
-static inline void btree_node_io_unlock(struct btree *b)
-{
-       EBUG_ON(!btree_node_write_in_flight(b));
-       clear_btree_node_write_in_flight(b);
-       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
-       wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
-                           TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
-       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
 
 static inline bool btree_node_may_write(struct btree *b)
 {
@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
                }
 
                six_unlock_type(&b->c.lock, lock_held);
-               btree_node_wait_on_io(b);
+               bch2_btree_node_wait_on_write(b);
                btree_node_lock_type(c, b, lock_held);
        }
 }
index a5d3973..37dadba 100644 (file)
@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as)
                six_unlock_read(&old->c.lock);
 
                if (seq == as->old_nodes_seq[i])
-                       btree_node_wait_on_io(old);
+                       bch2_btree_node_wait_on_write(old);
        }
 
        /*
index cbadb38..6a28de3 100644 (file)
@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        if (c->opts.nochanges)
                return;
 
-       btree_node_io_lock(b);
+       bch2_btree_node_io_lock(b);
        mutex_lock(&c->verify_lock);
 
        if (!c->verify_ondisk) {
@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        }
 out:
        mutex_unlock(&c->verify_lock);
-       btree_node_io_unlock(b);
+       bch2_btree_node_io_unlock(b);
 }
 
 #ifdef CONFIG_DEBUG_FS