bcachefs: Improve bset compaction
Author:     Kent Overstreet <kent.overstreet@gmail.com>
AuthorDate: Tue, 6 Apr 2021 19:33:19 +0000 (15:33 -0400)
Commit:     Kent Overstreet <kent.overstreet@linux.dev>
CommitDate: Sun, 22 Oct 2023 21:08:59 +0000 (17:08 -0400)
The previous patch, which fixed btree nodes being written too
aggressively, meant that we were no longer sorting btree node bsets
optimally; this patch fixes that.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_update_interior.h

diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 2ec668c..8ed8610 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -215,7 +215,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
-                       __bch2_btree_node_write(c, b, SIX_LOCK_read);
+                       __bch2_btree_node_write(c, b);
 
                /* wait for any in flight btree write */
                btree_node_wait_on_io(b);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 3b45389..fd90e43 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -241,7 +241,6 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
 }
 
 static void btree_node_sort(struct bch_fs *c, struct btree *b,
-                           struct btree_iter *iter,
                            unsigned start_idx,
                            unsigned end_idx,
                            bool filter_whiteouts)
@@ -377,8 +376,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
  * We're about to add another bset to the btree node, so if there's currently
  * too many bsets - sort some of them together:
  */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b,
-                              struct btree_iter *iter)
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
 {
        unsigned unwritten_idx;
        bool ret = false;
@@ -390,13 +388,13 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b,
                        break;
 
        if (b->nsets - unwritten_idx > 1) {
-               btree_node_sort(c, b, iter, unwritten_idx,
+               btree_node_sort(c, b, unwritten_idx,
                                b->nsets, false);
                ret = true;
        }
 
        if (unwritten_idx > 1) {
-               btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+               btree_node_sort(c, b, 0, unwritten_idx, false);
                ret = true;
        }
 
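Note, for context on the two sort calls above: btree_node_compact()
finds the first bset that hasn't hit disk (unwritten_idx), then sorts
the written and unwritten groups separately; bsets already on disk are
only merged with other written bsets, and dirty ones with dirty ones,
so the node's written/unwritten boundary stays intact. A minimal
userspace sketch of that partitioning, using a hypothetical
written/unwritten layout:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_BSETS 3

    int main(void)
    {
            /* Hypothetical node: bsets 0 and 1 already written, bset 2 dirty. */
            bool written[MAX_BSETS] = { true, true, false };
            unsigned nsets = 3, unwritten_idx;

            /* Find the first unwritten bset, as btree_node_compact() does: */
            for (unwritten_idx = 0; unwritten_idx < nsets; unwritten_idx++)
                    if (!written[unwritten_idx])
                            break;

            /* Sort each group only if it actually has multiple bsets: */
            if (nsets - unwritten_idx > 1)
                    printf("sort unwritten bsets [%u, %u)\n", unwritten_idx, nsets);
            if (unwritten_idx > 1)
                    printf("sort written bsets [0, %u)\n", unwritten_idx);
            /* With the layout above, only "sort written bsets [0, 2)" fires. */
            return 0;
    }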
@@ -426,12 +424,30 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
                          struct btree_iter *iter)
 {
        struct btree_node_entry *bne;
-       bool did_sort;
+       bool reinit_iter = false;
 
        EBUG_ON(!(b->c.lock.state.seq & 1));
        EBUG_ON(iter && iter->l[b->c.level].b != b);
+       BUG_ON(bset_written(b, bset(b, &b->set[1])));
+
+       if (b->nsets == MAX_BSETS) {
+               unsigned log_u64s[] = {
+                       ilog2(bset_u64s(&b->set[0])),
+                       ilog2(bset_u64s(&b->set[1])),
+                       ilog2(bset_u64s(&b->set[2])),
+               };
+
+               if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
+                       bch2_btree_node_write(c, b, SIX_LOCK_write);
+                       reinit_iter = true;
+               }
+       }
+
+       if (b->nsets == MAX_BSETS &&
+           btree_node_compact(c, b))
+               reinit_iter = true;
 
-       did_sort = btree_node_compact(c, b, iter);
+       BUG_ON(b->nsets >= MAX_BSETS);
 
        bne = want_new_bset(c, b);
        if (bne)
@@ -439,7 +455,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
 
        bch2_btree_build_aux_trees(b);
 
-       if (iter && did_sort)
+       if (iter && reinit_iter)
                bch2_btree_iter_reinit_node(iter, b);
 }
 
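The new MAX_BSETS branch above is the heart of the patch: once a node
holds three bsets, their sizes are compared on a log scale, and the node
is written out (post-write cleanup then resorts everything into a single
bset) whenever the middle bset is at least as big as the "average" of
the outer two; as best we can tell from the code, that means the bsets
are no longer in the steeply descending size progression that cheap
in-memory compaction relies on. A standalone sketch with hypothetical
sizes (ilog2() is a kernel helper, reimplemented here):

    #include <stdio.h>

    /* Userspace stand-in for the kernel's ilog2(). */
    static unsigned ilog2_u(unsigned v)
    {
            unsigned r = 0;

            while (v >>= 1)
                    r++;
            return r;
    }

    int main(void)
    {
            /* Hypothetical bset sizes in u64s: 512, 128, 4. */
            unsigned log_u64s[] = {
                    ilog2_u(512),   /* 9 */
                    ilog2_u(128),   /* 7 */
                    ilog2_u(4),     /* 2 */
            };

            /* 7 >= (9 + 2) / 2 == 5, so this node gets written out;
             * sizes 512, 8, 4 (logs 9, 3, 2) would instead fall
             * through to btree_node_compact(). */
            if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2)
                    puts("write node, reinit iterators");
            else
                    puts("fall through to in-memory compaction");
            return 0;
    }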
@@ -1324,8 +1340,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        return ret;
 }
 
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-                           enum six_lock_type lock_type_held)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
 {
        struct btree_write_bio *wbio;
        struct bset_tree *t;
@@ -1596,7 +1611,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
         * single bset:
         */
        if (b->nsets > 1) {
-               btree_node_sort(c, b, NULL, 0, b->nsets, true);
+               btree_node_sort(c, b, 0, b->nsets, true);
                invalidated_iter = true;
        } else {
                invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
@@ -1626,13 +1641,12 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
  * Use this one if the node is intent locked:
  */
 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-                         enum six_lock_type lock_type_held)
+                          enum six_lock_type lock_type_held)
 {
-       BUG_ON(lock_type_held == SIX_LOCK_write);
-
        if (lock_type_held == SIX_LOCK_intent ||
-           six_lock_tryupgrade(&b->c.lock)) {
-               __bch2_btree_node_write(c, b, SIX_LOCK_intent);
+           (lock_type_held == SIX_LOCK_read &&
+            six_lock_tryupgrade(&b->c.lock))) {
+               __bch2_btree_node_write(c, b);
 
                /* don't cycle lock unnecessarily: */
                if (btree_node_just_written(b) &&
@@ -1644,7 +1658,10 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->c.lock);
        } else {
-               __bch2_btree_node_write(c, b, SIX_LOCK_read);
+               __bch2_btree_node_write(c, b);
+               if (lock_type_held == SIX_LOCK_write &&
+                   btree_node_just_written(b))
+                       bch2_btree_post_write_cleanup(c, b);
        }
 }
 
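Note that the old BUG_ON(lock_type_held == SIX_LOCK_write) is gone:
bch2_btree_node_write() now accepts all three six-lock states, which is
what lets bch2_btree_init_next() call it under the write lock above. A
rough standalone sketch of the resulting dispatch (the lock API is
stubbed out; this is an illustration, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    enum held { HELD_READ, HELD_INTENT, HELD_WRITE };

    /* Stub for six_lock_tryupgrade(); assume the upgrade succeeds. */
    static bool tryupgrade(void) { return true; }

    static void node_write(enum held h)
    {
            if (h == HELD_INTENT || (h == HELD_READ && tryupgrade())) {
                    puts("write node; briefly take the write lock for cleanup");
                    if (h == HELD_READ)
                            puts("downgrade back to read");
            } else {
                    puts("write node");
                    /* New in this patch: the caller already holds the
                     * write lock, so cleanup can run inline. */
                    if (h == HELD_WRITE)
                            puts("run post-write cleanup inline");
            }
    }

    int main(void)
    {
            node_write(HELD_WRITE);
            return 0;
    }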
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 9c14cd3..95c3516 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -144,8 +144,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 void bch2_btree_write_error_work(struct work_struct *);
 
-void __bch2_btree_node_write(struct bch_fs *, struct btree *,
-                           enum six_lock_type);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index f2925b0..7eef3db 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -256,13 +256,15 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
        return remaining;
 }
 
+#define BTREE_WRITE_SET_U64s_BITS      9
+
 static inline unsigned btree_write_set_buffer(struct btree *b)
 {
        /*
         * Could buffer up larger amounts of keys for btrees with larger keys,
         * pending benchmarking:
         */
-       return 4 << 10;
+       return 8 << BTREE_WRITE_SET_U64s_BITS;
 }
 
 static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
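Finally, note that btree_write_set_buffer() still returns the same
value: 8 << 9 == 4 << 10 == 4096 bytes. The rewrite just expresses the
buffer as 2^BTREE_WRITE_SET_U64s_BITS = 512 u64s of 8 bytes each,
presumably so the constant is stated in the same units (u64s) that the
new size heuristic reasons in. A compile-time check of the equivalence
(userspace sketch, C11):

    /* The new expression is byte-for-byte the old 4KiB value. */
    #define BTREE_WRITE_SET_U64s_BITS 9

    _Static_assert((8 << BTREE_WRITE_SET_U64s_BITS) == (4 << 10),
                   "512 u64s * 8 bytes per u64 == 4096 bytes");

    int main(void) { return 0; }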