bcachefs: Fsck for reflink refcounts
author Kent Overstreet <kent.overstreet@gmail.com>
Sun, 23 May 2021 06:31:33 +0000 (02:31 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:05 +0000 (17:09 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_gc.c
fs/bcachefs/buckets.c
fs/bcachefs/reflink.c
fs/bcachefs/reflink.h

index 6962b3d..9bd6036 100644 (file)
@@ -391,6 +391,14 @@ struct gc_pos {
        unsigned                level;
 };
 
+/*
+ * One entry of the in-memory table gc uses to recompute reflink refcounts.
+ * An entry is created for each key in the reflink btree that carries a
+ * refcount field.
+ */
+struct reflink_gc {
+       /* copied from the key's p.offset when the table is built */
+       u64             offset;
+       /* copied from the key's size; offset - size is used as the range start */
+       u32             size;
+       /* starts at 0 and is adjusted as gc marks reflink pointers */
+       u32             refcount;
+};
+
+/* Generic radix tree of reflink_gc entries, filled in btree walk order */
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
 struct io_count {
        u64                     sectors[2][BCH_DATA_NR];
 };
@@ -806,6 +814,9 @@ mempool_t           bio_bounce_pages;
 
        /* REFLINK */
        u64                     reflink_hint;
+       reflink_gc_table        reflink_gc_table;
+       size_t                  reflink_gc_nr;
+       size_t                  reflink_gc_idx;
 
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
index 5b839cc..5a2acab 100644 (file)
@@ -23,6 +23,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "recovery.h"
+#include "reflink.h"
 #include "replicas.h"
 #include "super-io.h"
 #include "trace.h"
@@ -1285,6 +1286,201 @@ static int bch2_gc_start(struct bch_fs *c,
        return 0;
 }
 
+/*
+ * Initial-gc callback passed to bch2_btree_and_journal_walk() for the reflink
+ * btree: compare the refcount stored in @k with the one gc computed and, when
+ * fsck_err_on() says to repair it, queue a corrected copy of the key with
+ * bch2_journal_key_insert().
+ *
+ * Keys without a refcount field are skipped. Table entries are consumed in
+ * order through c->reflink_gc_idx, so this walk has to see the same
+ * refcounted keys, in the same order, as the walk that filled the table.
+ *
+ * Returns 0 on success, -EINVAL if the table entry is missing or does not
+ * describe @k, -ENOMEM if the replacement key cannot be allocated, or the
+ * error from bch2_journal_key_insert(). fsck_err_on() may also jump to the
+ * fsck_err label (macro not visible here; presumably with ret set).
+ */
+static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct reflink_gc *r;
+       const __le64 *refcount = bkey_refcount_c(k);
+       char buf[200];
+       int ret = 0;
+
+       if (!refcount)
+               return 0;
+
+       /*
+        * genradix_ptr() is a pure lookup: a NULL result means the table has
+        * no entry for this key, i.e. table and btree disagree. That is an
+        * inconsistency, not an allocation failure, so report it the same
+        * way as a mismatched entry.
+        */
+       r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
+       if (!r ||
+           r->offset != k.k->p.offset ||
+           r->size != k.k->size) {
+               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+               return -EINVAL;
+       }
+
+       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                       "reflink key has wrong refcount:\n"
+                       "  %s\n"
+                       "  should be %u",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                       r->refcount)) {
+               struct bkey_i *new;
+
+               new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+               if (!new) {
+                       ret = -ENOMEM;
+                       goto fsck_err;
+               }
+
+               bkey_reassemble(new, k);
+
+               if (!r->refcount) {
+                       /* nothing references this key any more: drop it */
+                       new->k.type = KEY_TYPE_deleted;
+                       new->k.size = 0;
+               } else {
+                       *bkey_refcount(new) = cpu_to_le64(r->refcount);
+               }
+
+               /*
+                * Only freed here on failure; on success the key is left
+                * allocated (presumably now owned by the journal keys --
+                * confirm against bch2_journal_key_insert()).
+                */
+               ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
+               if (ret)
+                       kfree(new);
+       }
+fsck_err:
+       return ret;
+}
+
+/*
+ * bch2_gc_reflink_done - check on-disk reflink refcounts against gc's counts
+ * @c:                 filesystem
+ * @initial:           true when called from the initial gc pass; the reflink
+ *                     btree is then walked with bch2_btree_and_journal_walk()
+ *                     and bch2_gc_reflink_done_initial_fn()
+ * @metadata_only:     when true nothing is checked and 0 is returned
+ *
+ * For every key in the reflink btree that has a refcount field, compare the
+ * stored refcount with the matching entry of c->reflink_gc_table and, if
+ * fsck_err_on() says to repair it, rewrite the key with the recomputed value
+ * (or as KEY_TYPE_deleted when the recomputed refcount is zero).
+ *
+ * The table is freed and c->reflink_gc_nr reset before returning, except on
+ * the @metadata_only early return.
+ *
+ * Returns 0 on success, -EINVAL if the table does not line up with the btree,
+ * -ENOMEM if a replacement key cannot be allocated, or the error from the
+ * walk / btree update.
+ */
+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
+                               bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct reflink_gc *r;
+       size_t idx = 0;
+       char buf[200];
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       if (initial) {
+               c->reflink_gc_idx = 0;
+
+               ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+                               bch2_gc_reflink_done_initial_fn);
+               goto out;
+       }
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               const __le64 *refcount = bkey_refcount_c(k);
+
+               if (!refcount)
+                       continue;
+
+               /*
+                * The table holds one entry per refcounted key, in walk
+                * order, so step to the next entry for each such key
+                * (mirrors c->reflink_gc_idx++ in the initial path).
+                */
+               r = genradix_ptr(&c->reflink_gc_table, idx++);
+               if (!r ||
+                   r->offset != k.k->p.offset ||
+                   r->size != k.k->size) {
+                       bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+                       ret = -EINVAL;
+                       break;
+               }
+
+               if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                               "reflink key has wrong refcount:\n"
+                               "  %s\n"
+                               "  should be %u",
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                               r->refcount)) {
+                       struct bkey_i *new;
+
+                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+                       if (!new) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+
+                       bkey_reassemble(new, k);
+
+                       if (!r->refcount)
+                               new->k.type = KEY_TYPE_deleted;
+                       else
+                               *bkey_refcount(new) = cpu_to_le64(r->refcount);
+
+                       /* the temporary key is freed whether or not the update succeeded */
+                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                                       __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
+                       kfree(new);
+
+                       if (ret)
+                               break;
+               }
+       }
+fsck_err:
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
+out:
+       genradix_free(&c->reflink_gc_table);
+       c->reflink_gc_nr = 0;
+       return ret;
+}
+
+/*
+ * Initial-gc callback passed to bch2_btree_and_journal_walk() for the reflink
+ * btree: append one table entry, with a refcount of zero, for each key that
+ * carries a refcount field.
+ *
+ * Returns 0 on success (including keys that are skipped) or -ENOMEM if the
+ * table cannot be extended.
+ */
+static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct reflink_gc *entry;
+
+       /* only keys that carry a refcount get a table slot */
+       if (!bkey_refcount_c(k))
+               return 0;
+
+       /* note: reflink_gc_nr is bumped even when the allocation fails */
+       entry = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+                                  GFP_KERNEL);
+       if (!entry)
+               return -ENOMEM;
+
+       *entry = (struct reflink_gc) {
+               .offset         = k.k->p.offset,
+               .size           = k.k->size,
+               .refcount       = 0,
+       };
+       return 0;
+}
+
+/*
+ * bch2_gc_reflink_start - build the table gc uses to recount reflink refs
+ * @c:                 filesystem
+ * @initial:           true when called from the initial gc pass; the reflink
+ *                     btree is then walked with bch2_btree_and_journal_walk()
+ *                     and bch2_gc_reflink_start_initial_fn()
+ * @metadata_only:     when true the table is left untouched and 0 is returned
+ *
+ * Discards any previous table, then appends one entry with a refcount of
+ * zero for every key in the reflink btree that has a refcount field.
+ *
+ * Returns 0 on success, -ENOMEM if the table cannot be extended, or the error
+ * reported by the btree walk.
+ */
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+                                bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct reflink_gc *r;
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       genradix_free(&c->reflink_gc_table);
+       c->reflink_gc_nr = 0;
+
+       if (initial)
+               return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+                               bch2_gc_reflink_start_initial_fn);
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               const __le64 *refcount = bkey_refcount_c(k);
+
+               if (!refcount)
+                       continue;
+
+               r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+                                      GFP_KERNEL);
+               if (!r) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               r->offset       = k.k->p.offset;
+               r->size         = k.k->size;
+               r->refcount     = 0;
+       }
+       bch2_trans_iter_put(&trans, iter);
+
+       bch2_trans_exit(&trans);
+
+       /*
+        * Hand allocation and iteration failures back to the caller instead
+        * of reporting success with a partially built table.
+        */
+       return ret;
+}
+
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute them:
  *
@@ -1319,7 +1515,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
 again:
-       ret = bch2_gc_start(c, metadata_only);
+       ret   = bch2_gc_start(c, metadata_only) ?:
+               bch2_gc_reflink_start(c, initial, metadata_only);
        if (ret)
                goto out;
 
@@ -1381,7 +1578,8 @@ out:
                bch2_journal_block(&c->journal);
 
                percpu_down_write(&c->mark_lock);
-               ret = bch2_gc_done(c, initial, metadata_only);
+               ret   = bch2_gc_reflink_done(c, initial, metadata_only) ?:
+                       bch2_gc_done(c, initial, metadata_only);
 
                bch2_journal_unblock(&c->journal);
        } else {
index b452ff0..ba6b1e7 100644 (file)
@@ -14,6 +14,7 @@
 #include "ec.h"
 #include "error.h"
 #include "movinggc.h"
+#include "reflink.h"
 #include "replicas.h"
 #include "trace.h"
 
@@ -1076,6 +1077,124 @@ static int bch2_mark_stripe(struct bch_fs *c,
        return 0;
 }
 
+/*
+ * Does the fragment [p_start, p_end) of reflink pointer @p, once shifted by
+ * the pointer's idx, overlap the range [v_start, v_end)?
+ *
+ * Returns 1 if it does and 0 if it does not; an empty fragment never
+ * overlaps. The result is an int so that callers can add results together.
+ */
+static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+                                      u64 p_start, u64 p_end,
+                                      u64 v_start, u64 v_end)
+{
+       u64 base;
+
+       /* an empty fragment references nothing */
+       if (p_start == p_end)
+               return 0;
+
+       base = le64_to_cpu(p.v->idx);
+
+       return p_end + base > v_start && p_start + base < v_end;
+}
+
+/*
+ * Same test as __reflink_p_frag_references(), with the range taken from key
+ * @k: from bkey_start_offset(k.k) up to k.k->p.offset.
+ */
+static int reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+                                    u64 start, u64 end,
+                                    struct bkey_s_c k)
+{
+       u64 v_start     = bkey_start_offset(k.k);
+       u64 v_end       = k.k->p.offset;
+
+       return __reflink_p_frag_references(p, start, end, v_start, v_end);
+}
+
+/*
+ * Apply one reflink pointer's contribution to the gc refcount of the table
+ * entry that covers @idx.
+ *
+ * @p:                 the reflink pointer being marked
+ * @idx:               position being marked
+ * @sectors:           number of sectors still to mark, starting at @idx
+ * @front_frag:        end of the leading fragment [0, front_frag) of @p
+ * @back_frag:         start of the trailing fragment [back_frag, p.k->size)
+ * @flags:             trigger flags; BTREE_TRIGGER_OVERWRITE selects a
+ *                     decrement instead of an increment
+ * @r_idx:             in/out cursor into c->reflink_gc_table; advanced to the
+ *                     first entry whose offset is greater than @idx
+ *
+ * Returns the number of sectors accounted for by this entry, i.e.
+ * min(@sectors, r->offset - @idx), or -EIO after flagging the filesystem
+ * inconsistent when no entry with offset > @idx exists.
+ */
+static int __bch2_mark_reflink_p(struct bch_fs *c,
+                       struct bkey_s_c_reflink_p p,
+                       u64 idx, unsigned sectors,
+                       unsigned front_frag,
+                       unsigned back_frag,
+                       unsigned flags,
+                       size_t *r_idx)
+{
+       struct reflink_gc *r;
+       int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+       int frags_referenced;
+
+       /* skip entries whose offset is <= idx; running off the table is an error */
+       while (1) {
+               if (*r_idx >= c->reflink_gc_nr)
+                       goto not_found;
+               r = genradix_ptr(&c->reflink_gc_table, *r_idx);
+               BUG_ON(!r);
+
+               if (r->offset > idx)
+                       break;
+               (*r_idx)++;
+       }
+
+       /*
+        * Count how many of the two fragments of p outside the marked range
+        * overlap this entry's range [offset - size, offset).
+        */
+       frags_referenced =
+               __reflink_p_frag_references(p, 0, front_frag,
+                                           r->offset - r->size, r->offset) +
+               __reflink_p_frag_references(p, back_frag, p.k->size,
+                                           r->offset - r->size, r->offset);
+
+       /*
+        * Both fragments overlap the entry: only expected with
+        * BTREE_TRIGGER_OVERWRITE_SPLIT, and the sign of the adjustment is
+        * flipped. Exactly one overlaps: only expected on overwrite, and the
+        * refcount is left unchanged.
+        * NOTE(review): presumably because the surviving fragment(s) of p keep
+        * referencing the entry -- confirm against the transactional trigger.
+        */
+       if (frags_referenced == 2) {
+               BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
+               add = -add;
+       } else if (frags_referenced == 1) {
+               BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
+               add = 0;
+       }
+
+       /* the recomputed refcount must never go negative */
+       BUG_ON((s64) r->refcount + add < 0);
+
+       r->refcount += add;
+       /* stop at the end of this entry; the caller continues from there */
+       return min_t(u64, sectors, r->offset - idx);
+not_found:
+       bch2_fs_inconsistent(c,
+               "%llu:%llu len %u points to nonexistent indirect extent %llu",
+               p.k->p.inode, p.k->p.offset, p.k->size, idx);
+       bch2_inconsistent_error(c);
+       return -EIO;
+}
+
+/*
+ * Mark the part of reflink pointer @p that starts @offset sectors into it and
+ * is |@sectors| long, against the gc reflink table.
+ *
+ * A binary search finds the first table entry whose offset is greater than
+ * the starting position; __bch2_mark_reflink_p() is then called repeatedly,
+ * each call consuming the sectors covered by one entry, until the whole
+ * range has been marked.
+ *
+ * Returns 0 on success or the negative error from __bch2_mark_reflink_p().
+ */
+static int bch2_mark_reflink_p(struct bch_fs *c,
+                              struct bkey_s_c_reflink_p p, unsigned offset,
+                              s64 sectors, unsigned flags)
+{
+       u64 cur = le64_to_cpu(p.v->idx) + offset;
+       size_t lo = 0, hi = c->reflink_gc_nr;
+       unsigned front_frag, back_frag;
+       s64 nr;
+
+       /* only the magnitude of the sector count is used here */
+       sectors = sectors < 0 ? -sectors : sectors;
+
+       BUG_ON(offset + sectors > p.k->size);
+
+       front_frag      = offset;
+       back_frag       = offset + sectors;
+
+       /* narrow [lo, hi) down to the first entry with offset > cur */
+       while (lo < hi) {
+               size_t mid = lo + (hi - lo) / 2;
+               struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, mid);
+
+               if (ref->offset > cur)
+                       hi = mid;
+               else
+                       lo = mid + 1;
+       }
+
+       /* each call handles the sectors that fall within one table entry */
+       for (; sectors; cur += nr, sectors -= nr) {
+               nr = __bch2_mark_reflink_p(c, p, cur, sectors,
+                               front_frag, back_frag, flags, &lo);
+               if (nr < 0)
+                       return nr;
+       }
+
+       return 0;
+}
+
 static int bch2_mark_key_locked(struct bch_fs *c,
                   struct bkey_s_c old,
                   struct bkey_s_c new,
@@ -1131,6 +1250,10 @@ static int bch2_mark_key_locked(struct bch_fs *c,
                fs_usage->persistent_reserved[replicas - 1]     += sectors;
                break;
        }
+       case KEY_TYPE_reflink_p:
+               ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k),
+                                         offset, sectors, flags);
+               break;
        }
 
        preempt_enable();
@@ -1693,35 +1816,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
        return ret;
 }
 
-static __le64 *bkey_refcount(struct bkey_i *k)
-{
-       switch (k->k.type) {
-       case KEY_TYPE_reflink_v:
-               return &bkey_i_to_reflink_v(k)->v.refcount;
-       case KEY_TYPE_indirect_inline_data:
-               return &bkey_i_to_indirect_inline_data(k)->v.refcount;
-       default:
-               return NULL;
-       }
-}
-
-static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p,
-                                     u64 start, u64 end,
-                                     struct bkey_s_c k)
-{
-       if (start == end)
-               return false;
-
-       start   += le64_to_cpu(p.v->idx);
-       end     += le64_to_cpu(p.v->idx);
-
-       if (end <= bkey_start_offset(k.k))
-               return false;
-       if (start >= k.k->p.offset)
-               return false;
-       return true;
-}
-
 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
                        u64 idx, unsigned sectors,
index c624fab..e986b52 100644 (file)
@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
 
-       refcount        = (void *) &r_v->v;
+       refcount        = bkey_refcount(r_v);
        *refcount       = 0;
        memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
 
index 9d5e7dc..bfc7856 100644 (file)
@@ -34,6 +34,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *,
        .val_to_text    = bch2_indirect_inline_data_to_text,    \
 }
 
+/*
+ * Return a read-only pointer to the refcount field of @k, or NULL when the
+ * key is neither a reflink_v nor an indirect_inline_data key.
+ */
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+       if (k.k->type == KEY_TYPE_reflink_v)
+               return &bkey_s_c_to_reflink_v(k).v->refcount;
+
+       if (k.k->type == KEY_TYPE_indirect_inline_data)
+               return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+
+       return NULL;
+}
+
+/*
+ * Return a writable pointer to the refcount field of @k, or NULL when the
+ * key is neither a reflink_v nor an indirect_inline_data key.
+ */
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+       if (k->k.type == KEY_TYPE_reflink_v)
+               return &bkey_i_to_reflink_v(k)->v.refcount;
+
+       if (k->k.type == KEY_TYPE_indirect_inline_data)
+               return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+
+       return NULL;
+}
+
 s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
                     u64, u64 *, u64, s64 *);