bcachefs: Fragmentation LRU
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 5 Dec 2022 15:24:19 +0000 (10:24 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:53 +0000 (17:09 -0400)
Now that we have much more efficient updates to the LRU btree, this
patch adds a new LRU that indexes buckets by fragmentation.

This means copygc no longer has to scan every bucket to find buckets
that need to be evacuated.

Changes:
 - A new field in bch_alloc_v4, fragmentation_lru - this corresponds to
   the bucket's position in the fragmentation LRU. We add a new field
   for this instead of calculating it as needed because we may make the
   fragmentation LRU optional; this field indicates whether a bucket is
   on the fragmentation LRU.

   Also, zoned devices will introduce variable bucket sizes; explicitly
   recording the LRU position will be safer for them.

 - A new copygc path for using the fragmentation LRU instead of
   scanning every bucket and building up an in-memory heap.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
13 files changed:
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/buckets_types.h
fs/bcachefs/lru.c
fs/bcachefs/lru.h
fs/bcachefs/move.c
fs/bcachefs/move.h
fs/bcachefs/movinggc.c
fs/bcachefs/recovery.c
fs/bcachefs/super.c
fs/bcachefs/trace.h

index af3e55f..aefe72d 100644 (file)
@@ -415,6 +415,8 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        prt_newline(out);
        prt_printf(out, "io_time[WRITE]    %llu",       a->io_time[WRITE]);
        prt_newline(out);
+       prt_printf(out, "fragmentation     %llu",       a->fragmentation_lru);
+       prt_newline(out);
        prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
        prt_newline(out);
 
@@ -910,8 +912,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
            !new_a->io_time[READ])
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-       old_lru = alloc_lru_idx(*old_a);
-       new_lru = alloc_lru_idx(*new_a);
+       old_lru = alloc_lru_idx_read(*old_a);
+       new_lru = alloc_lru_idx_read(*new_a);
 
        if (old_lru != new_lru) {
                ret = bch2_lru_change(trans, new->k.p.inode,
@@ -921,6 +923,18 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
                        return ret;
        }
 
+       new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+                                       bch_dev_bkey_exists(c, new->k.p.inode));
+
+       if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+               ret = bch2_lru_change(trans,
+                               BCH_LRU_FRAGMENTATION_START,
+                               bucket_to_u64(new->k.p),
+                               old_a->fragmentation_lru, new_a->fragmentation_lru);
+               if (ret)
+                       return ret;
+       }
+
        if (old_a->gen != new_a->gen) {
                ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
                if (ret)
@@ -1777,7 +1791,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
                goto out;
 
        /* We expect harmless races here due to the btree write buffer: */
-       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v))
+       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
                goto out;
 
        BUG_ON(a->v.data_type != BCH_DATA_cached);
index b3c2f1e..96ac8f3 100644 (file)
@@ -64,11 +64,24 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
                                 a.stripe, a, data_type);
 }
 
-static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
 {
        return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
 }
 
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+                                             struct bch_dev *ca)
+{
+       if (a.data_type != BCH_DATA_btree &&
+           a.data_type != BCH_DATA_user)
+               return 0;
+
+       if (a.dirty_sectors >= ca->mi.bucket_size)
+               return 0;
+
+       return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+}
+
 static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
 {
        return ((u64) alloc_gc_gen(a) >> 4) << 56;
index 84b30ad..5dc4b0c 100644 (file)
@@ -927,7 +927,6 @@ struct bch_fs {
 
        /* COPYGC */
        struct task_struct      *copygc_thread;
-       copygc_heap             copygc_heap;
        struct write_point      copygc_write_point;
        s64                     copygc_wait;
        bool                    copygc_running;
index 99f9fbd..9524ff0 100644 (file)
@@ -992,6 +992,7 @@ struct bch_alloc_v4 {
        __u64                   io_time[2];
        __u32                   stripe;
        __u32                   nr_external_backpointers;
+       __u64                   fragmentation_lru;
 } __packed __aligned(8);
 
 #define BCH_ALLOC_V4_U64s_V0   6
@@ -1563,7 +1564,8 @@ struct bch_sb_field_journal_seq_blacklist {
        x(inode_v3,                     23)             \
        x(unwritten_extents,            24)             \
        x(bucket_gens,                  25)             \
-       x(lru_v2,                       26)
+       x(lru_v2,                       26)             \
+       x(fragmentation_lru,            27)
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
index 1dbba7d..2a9dab9 100644 (file)
@@ -89,15 +89,4 @@ struct disk_reservation {
        unsigned                nr_replicas;
 };
 
-struct copygc_heap_entry {
-       u8                      dev;
-       u8                      gen;
-       u8                      replicas;
-       u32                     fragmentation;
-       u32                     sectors;
-       u64                     bucket;
-};
-
-typedef HEAP(struct copygc_heap_entry) copygc_heap;
-
 #endif /* _BUCKETS_TYPES_H */
index c121a7c..e913b90 100644 (file)
@@ -93,6 +93,13 @@ int bch2_lru_change(struct btree_trans *trans,
                bch2_lru_set(trans, lru_id, dev_bucket, new_time);
 }
 
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+       BCH_LRU_TYPES()
+#undef x
+       NULL
+};
+
 static int bch2_check_lru_key(struct btree_trans *trans,
                              struct btree_iter *lru_iter,
                              struct bkey_s_c lru_k,
@@ -105,7 +112,9 @@ static int bch2_check_lru_key(struct btree_trans *trans,
        const struct bch_alloc_v4 *a;
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
+       enum bch_lru_type type = lru_type(lru_k);
        struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+       u64 idx;
        int ret;
 
        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
@@ -121,9 +130,17 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 
        a = bch2_alloc_to_v4(k, &a_convert);
 
+       switch (type) {
+       case BCH_LRU_read:
+               idx = alloc_lru_idx_read(*a);
+               break;
+       case BCH_LRU_fragmentation:
+               idx = a->fragmentation_lru;
+               break;
+       }
+
        if (lru_k.k->type != KEY_TYPE_set ||
-           a->data_type != BCH_DATA_cached ||
-           a->io_time[READ] != lru_pos_time(lru_k.k->p)) {}
+           lru_pos_time(lru_k.k->p) != idx) {
                if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
                        *last_flushed_pos = lru_k.k->p;
                        ret = bch2_btree_write_buffer_flush_sync(trans) ?:
@@ -131,17 +148,14 @@ static int bch2_check_lru_key(struct btree_trans *trans,
                        goto out;
                }
 
-               if (fsck_err_on(lru_k.k->type != KEY_TYPE_set ||
-                               a->data_type != BCH_DATA_cached ||
-                               a->io_time[READ] != lru_pos_time(lru_k.k->p), c,
-                               "incorrect lru entry (time %llu) %s\n"
-                               "  for %s",
-                               lru_pos_time(lru_k.k->p),
-                               (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
-                               (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+               if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+                            "  %s\n"
+                            "  for %s",
+                            bch2_lru_types[type],
+                            lru_pos_time(lru_k.k->p),
+                            (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+                            (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
                        ret = bch2_btree_delete_at(trans, lru_iter, 0);
-                       if (ret)
-                               goto err;
        }
 out:
 err:
index b8d9848..78a6076 100644 (file)
@@ -22,6 +22,27 @@ static inline u64 lru_pos_time(struct bpos pos)
        return pos.inode & ~(~0ULL << LRU_TIME_BITS);
 }
 
+#define BCH_LRU_TYPES()                \
+       x(read)                 \
+       x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+       BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START    ((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+       u16 lru_id = l.k->p.inode >> 48;
+
+       if (lru_id == BCH_LRU_FRAGMENTATION_START)
+               return BCH_LRU_fragmentation;
+       return BCH_LRU_read;
+}
+
 int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
index 67f861e..c964643 100644 (file)
@@ -652,13 +652,13 @@ failed_to_evacuate:
        printbuf_exit(&buf);
 }
 
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct btree_trans *trans,
+                          struct moving_context *ctxt,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
 {
        struct bch_fs *c = ctxt->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
@@ -667,17 +667,17 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
+       u64 fragmentation;
        u64 bp_offset = 0, cur_inum = U64_MAX;
        int ret = 0;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_CACHED);
-       ret = lockrestart_do(&trans,
+       ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret) {
                bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
@@ -687,17 +687,18 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = a->dirty_sectors;
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+       fragmentation = a->fragmentation_lru;
 
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret) {
                bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
                goto err;
        }
 
-       while (!(ret = move_ratelimit(&trans, ctxt))) {
-               bch2_trans_begin(&trans);
+       while (!(ret = move_ratelimit(trans, ctxt))) {
+               bch2_trans_begin(trans);
 
-               ret = bch2_get_next_backpointer(&trans, bucket, gen,
+               ret = bch2_get_next_backpointer(trans, bucket, gen,
                                                &bp_offset, &bp,
                                                BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -712,7 +713,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        struct bkey_s_c k;
                        unsigned i = 0;
 
-                       k = bch2_backpointer_get_key(&trans, &iter,
+                       k = bch2_backpointer_get_key(trans, &iter,
                                                bucket, bp_offset, bp);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -725,9 +726,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);
 
-                       ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+                       ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
                        if (ret) {
-                               bch2_trans_iter_exit(&trans, &iter);
+                               bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }
 
@@ -741,15 +742,15 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                                i++;
                        }
 
-                       ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+                       ret = bch2_move_extent(trans, &iter, ctxt, io_opts,
                                               bp.btree_id, k, data_opts);
-                       bch2_trans_iter_exit(&trans, &iter);
+                       bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
+                               bch2_move_ctxt_wait_for_io(ctxt, trans);
                                continue;
                        }
                        if (ret)
@@ -761,7 +762,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                } else {
                        struct btree *b;
 
-                       b = bch2_backpointer_get_node(&trans, &iter,
+                       b = bch2_backpointer_get_node(trans, &iter,
                                                bucket, bp_offset, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
@@ -773,8 +774,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        if (!b)
                                goto next;
 
-                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
-                       bch2_trans_iter_exit(&trans, &iter);
+                       ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+                       bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
@@ -791,17 +792,16 @@ next:
                bp_offset++;
        }
 
-       trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, ret);
+       trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
                move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
                closure_sync(&ctxt->cl);
                if (!ctxt->write_error)
-                       verify_bucket_evacuated(&trans, bucket, gen);
+                       verify_bucket_evacuated(trans, bucket, gen);
        }
 err:
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
        return ret;
 }
@@ -814,12 +814,15 @@ int bch2_evacuate_bucket(struct bch_fs *c,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
 {
+       struct btree_trans trans;
        struct moving_context ctxt;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 0);
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+       ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);
+       bch2_trans_exit(&trans);
 
        return ret;
 }
index aef6138..c5a7c0a 100644 (file)
@@ -66,7 +66,8 @@ int bch2_move_data(struct bch_fs *,
                   bool,
                   move_pred_fn, void *);
 
-int __bch2_evacuate_bucket(struct moving_context *,
+int __bch2_evacuate_bucket(struct btree_trans *,
+                          struct moving_context *,
                           struct bpos, int,
                           struct data_update_opts);
 int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
index b420b79..74e57f6 100644 (file)
@@ -10,6 +10,7 @@
 #include "alloc_foreground.h"
 #include "btree_iter.h"
 #include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
@@ -19,6 +20,7 @@
 #include "eytzinger.h"
 #include "io.h"
 #include "keylist.h"
+#include "lru.h"
 #include "move.h"
 #include "movinggc.h"
 #include "super-io.h"
 #include <linux/sort.h>
 #include <linux/wait.h>
 
-static inline int fragmentation_cmp(copygc_heap *heap,
-                                  struct copygc_heap_entry l,
-                                  struct copygc_heap_entry r)
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+                                 struct bpos bucket, u64 time, u8 *gen)
 {
-       return cmp_int(l.fragmentation, r.fragmentation);
-}
-
-static int find_buckets_to_copygc(struct bch_fs *c)
-{
-       copygc_heap *h = &c->copygc_heap;
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bch_alloc_v4 _a;
+       const struct bch_alloc_v4 *a;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset))
+               return 0;
 
-       /*
-        * Find buckets with lowest sector counts, skipping completely
-        * empty buckets, by building a maxheap sorted by sector count,
-        * and repeatedly replacing the maximum element until all
-        * buckets have been visited.
-        */
-       h->used = 0;
-
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
-               struct copygc_heap_entry e;
-               struct bch_alloc_v4 a_convert;
-               const struct bch_alloc_v4 *a;
-
-               a = bch2_alloc_to_v4(k, &a_convert);
-
-               if ((a->data_type != BCH_DATA_btree &&
-                    a->data_type != BCH_DATA_user) ||
-                   a->dirty_sectors >= ca->mi.bucket_size ||
-                   bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
-                       continue;
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               return ret;
 
-               e = (struct copygc_heap_entry) {
-                       .dev            = iter.pos.inode,
-                       .gen            = a->gen,
-                       .replicas       = 1 + a->stripe_redundancy,
-                       .fragmentation  = div_u64((u64) a->dirty_sectors * (1ULL << 31),
-                                                 ca->mi.bucket_size),
-                       .sectors        = a->dirty_sectors,
-                       .bucket         = iter.pos.offset,
-               };
-               heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+       a = bch2_alloc_to_v4(k, &_a);
+       *gen = a->gen;
+       ret = (a->data_type == BCH_DATA_btree ||
+              a->data_type == BCH_DATA_user) &&
+               a->fragmentation_lru &&
+               a->fragmentation_lru <= time;
 
+       if (ret) {
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, trans->c, k);
+               pr_debug("%s", buf.buf);
+               printbuf_exit(&buf);
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       bch2_trans_exit(&trans);
        return ret;
 }
 
+static int bch2_copygc_next_bucket(struct btree_trans *trans,
+                                  struct bpos *bucket, u8 *gen, struct bpos *pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+                                 bpos_max(*pos, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0)),
+                                 lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+                                 0, k, ({
+               *bucket = u64_to_bucket(k.k->p.offset);
+
+               bch2_bucket_is_movable(trans, *bucket, lru_pos_time(k.k->p), gen);
+       }));
+
+       *pos = iter.pos;
+       if (ret < 0)
+               return ret;
+       return ret ? 0 : -ENOENT;
+}
+
 static int bch2_copygc(struct bch_fs *c)
 {
-       copygc_heap *h = &c->copygc_heap;
-       struct copygc_heap_entry e;
        struct bch_move_stats move_stats;
-       struct bch_dev *ca;
-       unsigned dev_idx;
-       size_t heap_size = 0;
+       struct btree_trans trans;
        struct moving_context ctxt;
        struct data_update_opts data_opts = {
                .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
        };
+       struct bpos bucket;
+       struct bpos pos;
+       u8 gen = 0;
+       unsigned nr_evacuated;
        int ret = 0;
 
        bch2_move_stats_init(&move_stats, "copygc");
-
-       for_each_rw_member(ca, c, dev_idx)
-               heap_size += ca->mi.nbuckets >> 7;
-
-       if (h->size < heap_size) {
-               free_heap(&c->copygc_heap);
-               if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
-                       bch_err(c, "error allocating copygc heap");
-                       return 0;
-               }
-       }
-
-       ret = find_buckets_to_copygc(c);
-       if (ret) {
-               bch2_fs_fatal_error(c, "error walking buckets to copygc!");
-               return ret;
-       }
-
-       if (!h->used) {
-               s64 wait = S64_MAX, dev_wait;
-               u64 dev_min_wait_fragmented = 0;
-               u64 dev_min_wait_allowed = 0;
-               int dev_min_wait = -1;
-
-               for_each_rw_member(ca, c, dev_idx) {
-                       struct bch_dev_usage usage = bch2_dev_usage_read(ca);
-                       s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
-                                              ca->mi.bucket_size) >> 1);
-                       s64 fragmented = usage.d[BCH_DATA_user].fragmented;
-
-                       dev_wait = max(0LL, allowed - fragmented);
-
-                       if (dev_min_wait < 0 || dev_wait < wait) {
-                               dev_min_wait = dev_idx;
-                               dev_min_wait_fragmented = fragmented;
-                               dev_min_wait_allowed    = allowed;
-                       }
-               }
-
-               bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
-                                   dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
-               return 0;
-       }
-
-       heap_resort(h, fragmentation_cmp, NULL);
-
        bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
                              writepoint_ptr(&c->copygc_write_point),
                              false);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = bch2_btree_write_buffer_flush(&trans);
+       BUG_ON(ret);
 
-       /* not correct w.r.t. device removal */
-       while (h->used && !ret) {
-               BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
-               ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
-                                            data_opts);
+       for (nr_evacuated = 0, pos = POS_MIN;
+            nr_evacuated < 32 && !ret;
+            nr_evacuated++, pos = bpos_nosnap_successor(pos)) {
+               ret = bch2_copygc_next_bucket(&trans, &bucket, &gen, &pos) ?:
+                       __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
+               if (bkey_eq(pos, POS_MAX))
+                       break;
        }
 
+       bch2_trans_exit(&trans);
        bch2_moving_ctxt_exit(&ctxt);
 
+       /* no entries in LRU btree found, or got to end: */
+       if (ret == -ENOENT)
+               ret = 0;
+
        if (ret < 0 && !bch2_err_matches(ret, EROFS))
                bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
 
index 178f064..1976d5f 100644 (file)
@@ -1105,6 +1105,9 @@ int bch2_fs_recovery(struct bch_fs *c)
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
                        c->opts.fix_errors      = FSCK_OPT_YES;
+               } else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) {
+                       bch_info(c, "version prior to backpointers, upgrade required");
+                       c->opts.version_upgrade = true;
                }
        }
 
index 58517f6..f703e41 100644 (file)
@@ -487,7 +487,6 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);
-       free_heap(&c->copygc_heap);
 
        if (c->io_complete_wq)
                destroy_workqueue(c->io_complete_wq);
index 24dd2de..30b1090 100644 (file)
@@ -723,8 +723,8 @@ TRACE_EVENT(move_data,
 TRACE_EVENT(evacuate_bucket,
        TP_PROTO(struct bch_fs *c, struct bpos *bucket,
                 unsigned sectors, unsigned bucket_size,
-                int ret),
-       TP_ARGS(c, bucket, sectors, bucket_size, ret),
+                u64 fragmentation, int ret),
+       TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
 
        TP_STRUCT__entry(
                __field(dev_t,          dev             )
@@ -732,6 +732,7 @@ TRACE_EVENT(evacuate_bucket,
                __field(u64,            bucket          )
                __field(u32,            sectors         )
                __field(u32,            bucket_size     )
+               __field(u64,            fragmentation   )
                __field(int,            ret             )
        ),
 
@@ -741,14 +742,15 @@ TRACE_EVENT(evacuate_bucket,
                __entry->bucket                 = bucket->offset;
                __entry->sectors                = sectors;
                __entry->bucket_size            = bucket_size;
+               __entry->fragmentation          = fragmentation;
                __entry->ret                    = ret;
        ),
 
-       TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
+       TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->member, __entry->bucket,
                  __entry->sectors, __entry->bucket_size,
-                 __entry->ret)
+                 __entry->fragmentation, __entry->ret)
 );
 
 TRACE_EVENT(copygc,