From: Kent Overstreet Date: Sun, 11 Jul 2021 03:03:15 +0000 (-0400) Subject: bcachefs: Really don't hold btree locks while btree IOs are in flight X-Git-Tag: microblaze-v6.8~204^2~1480 X-Git-Url: http://git.monstr.eu/?a=commitdiff_plain;h=19d5432445fd0898231063518c375f7551d7c3d2;p=linux-2.6-microblaze.git bcachefs: Really don't hold btree locks while btree IOs are in flight This is something we've attempted to stick to for quite some time, as it helps guarantee filesystem latency - but there's a few remaining paths that this patch fixes. This is also necessary for an upcoming patch to update btree pointers after every btree write - since the btree write completion path will now be doing btree operations. Signed-off-by: Kent Overstreet --- diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 15f597ab03e7..051d2867ad63 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + return -ENOMEM; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; @@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + goto out_unlock; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + if (btree_node_noevict(b)) goto out_unlock; if (!btree_node_may_write(b)) goto out_unlock; - if (btree_node_dirty(b) && - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - goto out_unlock; - - if (btree_node_dirty(b) || - btree_node_write_in_flight(b) || - btree_node_read_in_flight(b)) { - if (!flush) + if (btree_node_dirty(b)) { + if (!flush || + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) goto out_unlock; - - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) else __bch2_btree_node_write(c, b); - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; } out: if (b->hash_val && !ret) @@ -581,6 +594,7 @@ got_node: } BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); out: b->flags = 0; @@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, { struct btree_cache *bc = &c->btree_cache; struct btree *b; + u32 seq; BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* @@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } + set_btree_node_read_in_flight(b); + + six_unlock_write(&b->c.lock); + seq = b->c.lock.state.seq; + six_unlock_intent(&b->c.lock); + /* Unlock before doing IO: */ if (iter && sync) bch2_trans_unlock(iter->trans); bch2_btree_node_read(c, b, sync); - six_unlock_write(&b->c.lock); - - if (!sync) { - six_unlock_intent(&b->c.lock); + if (!sync) return NULL; - } /* * XXX: this will probably always fail because btree_iter_relock() * currently fails for iterators that aren't pointed at a valid btree * node */ - if (iter && !bch2_trans_relock(iter->trans)) { - six_unlock_intent(&b->c.lock); + if (iter && !bch2_trans_relock(iter->trans)) return ERR_PTR(-EINTR); - } - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); + if (!six_relock_type(&b->c.lock, lock_type, seq)) + return ERR_PTR(-EINTR); return b; } @@ -831,11 +846,12 @@ lock_node: } if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = b->c.lock.state.seq; + six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(iter->trans); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + bch2_btree_node_wait_on_read(b); /* * XXX: check if this always fails - btree_iter_relock() @@ -844,7 +860,9 @@ lock_node: */ if (iter && !bch2_trans_relock(iter->trans)) return ERR_PTR(-EINTR); - goto retry; + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; } prefetch(b->aux_data); @@ -923,8 +941,7 @@ lock_node: } /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + __bch2_btree_node_wait_on_read(b); prefetch(b->aux_data); @@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + * */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); six_lock_intent(&b->c.lock, NULL, NULL); six_lock_write(&b->c.lock, NULL, NULL); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - __bch2_btree_node_write(c, b); - - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } BUG_ON(btree_node_dirty(b)); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0095c789f6e5..2974b2ad6966 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -22,6 +22,44 @@ #include +void bch2_btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +void bch2_btree_node_io_lock(struct btree *b) +{ + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + static void verify_no_dups(struct btree *b, struct bkey_packed *start, struct bkey_packed *end) @@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans, EBUG_ON(iter && iter->l[b->c.level].b != b); BUG_ON(bset_written(b, bset(b, &b->set[1]))); - if (b->nsets == MAX_BSETS) { + if (b->nsets == MAX_BSETS && + !btree_node_write_in_flight(b)) { unsigned log_u64s[] = { ilog2(bset_u64s(&b->set[0])), ilog2(bset_u64s(&b->set[1])), @@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); - set_btree_node_read_in_flight(b); - if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) return; @@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, bkey_copy(&b->key, k); BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + set_btree_node_read_in_flight(b); + bch2_btree_node_read(c, b, true); if (btree_node_read_error(b)) { @@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_write *w = btree_prev_write(b); bch2_btree_complete_write(c, b, w); - btree_node_io_unlock(b); + bch2_btree_node_io_unlock(b); } static void bch2_btree_node_write_error(struct bch_fs *c, @@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; + BUG_ON(btree_node_write_in_flight(b)); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) * XXX waiting on btree writes with btree locks held - * this can deadlock, and we hit the write error path */ - btree_node_wait_on_io(b); + bch2_btree_node_wait_on_write(b); continue; } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index fae67622c127..89fd4aba5218 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -52,24 +52,12 @@ struct btree_write_bio { struct bch_write_bio wbio; }; -static inline void btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - clear_btree_node_write_in_flight(b); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -static inline void btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -static inline void btree_node_wait_on_io(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} +void bch2_btree_node_io_unlock(struct btree *); +void bch2_btree_node_io_lock(struct btree *); +void __bch2_btree_node_wait_on_read(struct btree *); +void __bch2_btree_node_wait_on_write(struct btree *); +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); static inline bool btree_node_may_write(struct btree *b) { @@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, } six_unlock_type(&b->c.lock, lock_held); - btree_node_wait_on_io(b); + bch2_btree_node_wait_on_write(b); btree_node_lock_type(c, b, lock_held); } } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a5d3973e163a..37dadbae41e5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as) six_unlock_read(&old->c.lock); if (seq == as->old_nodes_seq[i]) - btree_node_wait_on_io(old); + bch2_btree_node_wait_on_write(old); } /* diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cbadb38f680f..6a28de30ea3b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (c->opts.nochanges) return; - btree_node_io_lock(b); + bch2_btree_node_io_lock(b); mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { @@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) } out: mutex_unlock(&c->verify_lock); - btree_node_io_unlock(b); + bch2_btree_node_io_unlock(b); } #ifdef CONFIG_DEBUG_FS