bcachefs: Go RW before bch2_check_lrus()
author: Kent Overstreet <kent.overstreet@gmail.com>
Thu, 21 Apr 2022 17:13:57 +0000 (13:13 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:32 +0000 (17:09 -0400)
Btree updates done before going RW are expensive if they're in random
order, since they are inserted into the list of keys for journal replay,
which is just a gap buffer.

This patch improves the bucket invalidate path so that if
bch2_check_lrus() hasn't finished it only prints warnings instead of
doing an emergency shutdown, which means we can now set BCH_FS_MAY_GO_RW
before bch2_check_lrus().

Also, the filesystem state bits are reorganized a bit.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/lru.c
fs/bcachefs/recovery.c

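diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index d9cf676..eb03b41 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c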
@@ -382,7 +382,8 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                return -EINVAL;
                        }
 
-                       if (!a.v->io_time[READ]) {
+                       if (!a.v->io_time[READ] &&
+                           test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
                                pr_buf(err, "cached bucket with read_time == 0");
                                return -EINVAL;
                        }
@@ -588,7 +589,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
            !new_a->io_time[READ])
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-
        old_lru = alloc_lru_idx(old_a);
        new_lru = alloc_lru_idx(*new_a);
 
@@ -1088,6 +1088,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
 
        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(ca->dev_idx, 0), 0);
+next_lru:
        k = bch2_btree_iter_peek(&lru_iter);
        ret = bkey_err(k);
        if (ret)
@@ -1096,9 +1097,20 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
        if (!k.k || k.k->p.inode != ca->dev_idx)
                goto out;
 
-       if (bch2_trans_inconsistent_on(k.k->type != KEY_TYPE_lru, trans,
-                                      "non lru key in lru btree"))
-               goto out;
+       if (k.k->type != KEY_TYPE_lru) {
+               pr_buf(&buf, "non lru key in lru btree:\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+                       bch2_btree_iter_advance(&lru_iter);
+                       goto next_lru;
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
 
        idx     = k.k->p.offset;
        bucket  = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
@@ -1111,13 +1123,19 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
 
        if (idx != alloc_lru_idx(a->v)) {
                pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n  ");
-
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
                pr_buf(&buf, "\n  ");
                bch2_bkey_val_to_text(&buf, c, k);
-               bch2_trans_inconsistent(trans, "%s", buf.buf);
-               ret = -EINVAL;
-               goto out;
+
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+                       bch2_btree_iter_advance(&lru_iter);
+                       goto next_lru;
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
+                       goto out;
+               }
        }
 
        SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
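diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 5dda57a..127323b 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h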
@@ -494,11 +494,6 @@ struct bch_dev {
 
 enum {
        /* startup: */
-       BCH_FS_CLEAN_SHUTDOWN,
-       BCH_FS_INITIAL_GC_DONE,
-       BCH_FS_INITIAL_GC_UNFIXED,
-       BCH_FS_TOPOLOGY_REPAIR_DONE,
-       BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
        BCH_FS_MAY_GO_RW,
        BCH_FS_RW,
@@ -508,16 +503,22 @@ enum {
        BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
+       BCH_FS_CLEAN_SHUTDOWN,
+
+       /* fsck passes: */
+       BCH_FS_TOPOLOGY_REPAIR_DONE,
+       BCH_FS_INITIAL_GC_DONE,         /* kill when we enumerate fsck passes */
+       BCH_FS_CHECK_LRUS_DONE,
+       BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
+       BCH_FS_FSCK_DONE,
+       BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
+       BCH_FS_NEED_ANOTHER_GC,
 
        /* errors: */
        BCH_FS_ERROR,
        BCH_FS_TOPOLOGY_ERROR,
        BCH_FS_ERRORS_FIXED,
        BCH_FS_ERRORS_NOT_FIXED,
-
-       /* misc: */
-       BCH_FS_NEED_ANOTHER_GC,
-       BCH_FS_DELETED_NODES,
 };
 
 struct btree_debug {
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index fe9d157..ce23b38 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -204,7 +204,9 @@ int bch2_check_lrus(struct bch_fs *c, bool initial)
 
        for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
                        bch2_check_lru_key(&trans, &iter, initial));
                if (ret)
                        break;
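diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index e2474ff..5831ab5 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c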
@@ -994,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
        if (ret)
                return ret;
 
-
        bkey_subvolume_init(&root_volume.k_i);
        root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
        root_volume.v.flags     = 0;
@@ -1096,6 +1095,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                }
        }
 
+       if (c->opts.fsck && c->opts.norecovery) {
+               bch_err(c, "cannot select both norecovery and fsck");
+               ret = -EINVAL;
+               goto err;
+       }
+
        ret = bch2_blacklist_table_initialize(c);
        if (ret) {
                bch_err(c, "error initializing blacklist table");
@@ -1189,6 +1194,13 @@ use_clean:
        if (ret)
                goto err;
 
+       /*
+        * Skip past versions that might have possibly been used (as nonces),
+        * but hadn't had their pointers written:
+        */
+       if (c->sb.encryption_type && !c->sb.clean)
+               atomic64_add(1 << 16, &c->key_version);
+
        ret = read_btree_roots(c);
        if (ret)
                goto err;
@@ -1211,12 +1223,7 @@ use_clean:
                goto err;
        bch_verbose(c, "stripes_read done");
 
-       /*
-        * If we're not running fsck, this ensures bch2_fsck_err() calls are
-        * instead interpreted as bch2_inconsistent_err() calls:
-        */
-       if (!c->opts.fsck)
-               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+       bch2_stripes_heap_start(c);
 
        if (c->opts.fsck) {
                bool metadata_only = c->opts.norecovery;
@@ -1228,6 +1235,8 @@ use_clean:
                        goto err;
                bch_verbose(c, "done checking allocations");
 
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
                bch_info(c, "checking need_discard and freespace btrees");
                err = "error checking need_discard and freespace btrees";
                ret = bch2_check_alloc_info(c);
@@ -1235,55 +1244,60 @@ use_clean:
                        goto err;
                bch_verbose(c, "done checking need_discard and freespace btrees");
 
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+
+               bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
+
                bch_info(c, "checking lrus");
                err = "error checking lrus";
                ret = bch2_check_lrus(c, true);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking lrus");
-       }
 
-       bch2_stripes_heap_start(c);
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
 
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
-
-       /*
-        * Skip past versions that might have possibly been used (as nonces),
-        * but hadn't had their pointers written:
-        */
-       if (c->sb.encryption_type && !c->sb.clean)
-               atomic64_add(1 << 16, &c->key_version);
-
-       if (c->opts.norecovery)
-               goto out;
-
-       bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
-       err = "journal replay failed";
-       ret = bch2_journal_replay(c);
-       if (ret)
-               goto err;
-       if (c->opts.verbose || !c->sb.clean)
-               bch_info(c, "journal replay done");
-
-       err = "error initializing freespace";
-       ret = bch2_fs_freespace_init(c);
-       if (ret)
-               goto err;
-
-       if (c->opts.fsck) {
                bch_info(c, "checking alloc to lru refs");
                err = "error checking alloc to lru refs";
                ret = bch2_check_alloc_to_lru_refs(c);
                if (ret)
                        goto err;
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
 
                ret = bch2_check_lrus(c, true);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking alloc to lru refs");
+       } else {
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+               if (c->opts.norecovery)
+                       goto out;
+
+               bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
        }
 
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
+       if (ret)
+               goto err;
+
        if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
                bch2_fs_lazy_rw(c);