bcachefs: Account for internal fragmentation better
Author:     Kent Overstreet <kent.overstreet@gmail.com>
AuthorDate: Tue, 24 Jul 2018 18:54:39 +0000 (14:54 -0400)
Commit:     Kent Overstreet <kent.overstreet@linux.dev>
CommitDate: Sun, 22 Oct 2023 21:08:08 +0000 (17:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_update_interior.c
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/buckets_types.h
fs/bcachefs/chardev.c
fs/bcachefs/fs.c
fs/bcachefs/sysfs.c

diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index a82677d..1fbb9c6 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -493,7 +493,8 @@ static void bch2_gc_start(struct bch_fs *c)
                struct bch_fs_usage *p =
                        per_cpu_ptr(c->usage_percpu, cpu);
 
-               memset(p->s, 0, sizeof(p->s));
+               memset(p->replicas, 0, sizeof(p->replicas));
+               memset(p->buckets, 0, sizeof(p->buckets));
        }
 
        percpu_up_write(&c->usage_lock);
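
bch2_gc_start() now clears only the counters that mark-and-sweep is about to recompute, rather than zeroing the whole per-cpu struct: replicas[] and buckets[] are rebuilt as gc re-marks everything, while the remaining fields carry state that must survive a gc run (online_reserved is checked by bch2_fs_stats_verify() and available_cache is reset separately in __recalc_sectors_available(), both later in this patch). A minimal sketch of that split, with array sizes and field order invented for illustration:

    #include <string.h>
    #include <stdint.h>
    typedef uint64_t u64;

    struct fs_usage_model {
        u64 online_reserved;            /* survives gc */
        u64 available_cache;            /* reset elsewhere */
        struct {
            u64 data[6];                /* stand-in for BCH_DATA_NR */
            u64 persistent_reserved;
        } replicas[4];                  /* stand-in for BCH_REPLICAS_MAX */
        u64 buckets[6];
    };

    static void gc_reset_model(struct fs_usage_model *p)
    {
        /* zero only what gc will recount: */
        memset(p->replicas, 0, sizeof(p->replicas));
        memset(p->buckets, 0, sizeof(p->buckets));
    }
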
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index aba01a7..a37b5ed 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -184,7 +184,8 @@ found:
         */
        replicas = bch2_extent_nr_dirty_ptrs(k);
        if (replicas)
-               stats->s[replicas - 1].data[BCH_DATA_BTREE] -= c->opts.btree_node_size;
+               stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
+                       c->opts.btree_node_size;
 
        /*
         * We're dropping @k from the btree, but it's still live until the
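
The replicas arrays are indexed by replica count minus one: slot i accumulates sectors stored as i + 1 replicas, so a key with n dirty pointers lands in slot n - 1, and __fs_usage_sum() below multiplies slot i by (i + 1) to recover raw sectors. A simplified model of the unlink-side accounting above (not the kernel code; the types and the BCH_DATA_BTREE value are stand-ins):

    #include <stdint.h>
    typedef uint64_t u64;

    enum { MODEL_DATA_BTREE = 3, MODEL_DATA_NR = 6 };  /* assumed ordering */

    struct replicas_model {
        u64 data[MODEL_DATA_NR];
        u64 persistent_reserved;
    };

    /* dropping a btree node key that had nr_dirty_ptrs pointers: */
    static void unaccount_btree_node(struct replicas_model *replicas,
                                     unsigned nr_dirty_ptrs,
                                     u64 btree_node_size)
    {
        if (nr_dirty_ptrs)
            replicas[nr_dirty_ptrs - 1].data[MODEL_DATA_BTREE] -=
                btree_node_size;
    }
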
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index c0dc0ce..56b197b 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -73,6 +73,8 @@
 
 #include <linux/preempt.h>
 
+static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+
 #ifdef DEBUG_BUCKETS
 
 #define lg_local_lock  lg_global_lock
@@ -84,18 +86,24 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
                __bch2_fs_usage_read(c);
        unsigned i, j;
 
-       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
-               for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++)
-                       if ((s64) stats.s[i].data[j] < 0)
-                               panic("replicas %u %s underflow: %lli\n",
+       for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
+               for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++)
+                       if ((s64) stats.replicas[i].data[j] < 0)
+                               panic("replicas %u %s sectors underflow: %lli\n",
                                      i + 1, bch_data_types[j],
-                                     stats.s[i].data[j]);
+                                     stats.replicas[i].data[j]);
 
-               if ((s64) stats.s[i].persistent_reserved < 0)
+               if ((s64) stats.replicas[i].persistent_reserved < 0)
                        panic("replicas %u reserved underflow: %lli\n",
-                             i + 1, stats.s[i].persistent_reserved);
+                             i + 1, stats.replicas[i].persistent_reserved);
        }
 
+       for (j = 0; j < ARRAY_SIZE(stats.buckets); j++)
+               if ((s64) stats.buckets[j] < 0)
+                       panic("%s buckets underflow: %lli\n",
+                             bch_data_types[j],
+                             stats.buckets[j]);
+
        if ((s64) stats.online_reserved < 0)
                panic("sectors_online_reserved underflow: %lli\n",
                      stats.online_reserved);
@@ -238,6 +246,7 @@ bch2_fs_usage_read(struct bch_fs *c)
 }
 
 struct fs_usage_sum {
+       u64     hidden;
        u64     data;
        u64     reserved;
 };
@@ -247,14 +256,21 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
        struct fs_usage_sum sum = { 0 };
        unsigned i, j;
 
-       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
-               u64 a = 0;
+       /*
+        * For superblock and journal we count bucket usage, not sector usage,
+        * because any internal fragmentation should _not_ be counted as
+        * free space:
+        */
+       for (j = 1; j < BCH_DATA_BTREE; j++)
+               sum.hidden += stats.buckets[j];
 
-               for (j = 0; j < ARRAY_SIZE(stats.s[i].data); j++)
-                       a += stats.s[i].data[j];
+       for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
+               for (j = BCH_DATA_BTREE;
+                    j < ARRAY_SIZE(stats.replicas[i].data);
+                    j++)
+                       sum.data += stats.replicas[i].data[j] * (i + 1);
 
-               sum.data        += a * (i + 1);
-               sum.reserved    += stats.s[i].persistent_reserved * (i + 1);
+               sum.reserved += stats.replicas[i].persistent_reserved * (i + 1);
        }
 
        sum.reserved += stats.online_reserved;
@@ -270,14 +286,14 @@ static u64 reserve_factor(u64 r)
 
 static u64 avail_factor(u64 r)
 {
-       return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1;
+       return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
 
-u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
+static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
 {
        struct fs_usage_sum sum = __fs_usage_sum(stats);
 
-       return sum.data + reserve_factor(sum.reserved);
+       return sum.hidden + sum.data + reserve_factor(sum.reserved);
 }
 
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
@@ -285,9 +301,9 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
        return min(c->capacity, __bch2_fs_sectors_used(c, stats));
 }
 
-u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
+static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
 {
-       return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
+       return c->capacity - bch2_fs_sectors_used(c, stats);
 }
 
 static inline int is_unavailable_bucket(struct bucket_mark m)
@@ -323,9 +339,9 @@ static bool bucket_became_unavailable(struct bch_fs *c,
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
-                       struct bch_fs_usage *stats,
-                       struct disk_reservation *disk_res,
-                       struct gc_pos gc_pos)
+                        struct bch_fs_usage *stats,
+                        struct disk_reservation *disk_res,
+                        struct gc_pos gc_pos)
 {
        struct fs_usage_sum sum = __fs_usage_sum(*stats);
        s64 added = sum.data + sum.reserved;
@@ -358,6 +374,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+                                 struct bch_fs_usage *stats,
                                  struct bucket_mark old, struct bucket_mark new)
 {
        struct bch_dev_usage *dev_usage;
@@ -374,6 +391,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                        bch2_data_types[new.data_type]);
        }
 
+       stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+       stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
+
        preempt_disable();
        dev_usage = this_cpu_ptr(ca->usage_percpu);
 
@@ -399,17 +419,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
        bch2_dev_stats_verify(ca);
 }
 
-#define bucket_data_cmpxchg(c, ca, g, new, expr)               \
+#define bucket_data_cmpxchg(c, ca, stats, g, new, expr)                \
 ({                                                             \
        struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
                                                                \
-       bch2_dev_usage_update(c, ca, _old, new);                \
+       bch2_dev_usage_update(c, ca, stats, _old, new);         \
        _old;                                                   \
 })
 
 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, struct bucket_mark *old)
 {
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
        struct bucket *g;
        struct bucket_mark new;
 
@@ -417,7 +438,7 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        g = bucket(ca, b);
 
-       *old = bucket_data_cmpxchg(c, ca, g, new, ({
+       *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                BUG_ON(!is_available_bucket(new));
 
                new.owned_by_allocator  = 1;
@@ -436,6 +457,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, bool owned_by_allocator,
                            struct gc_pos pos, unsigned flags)
 {
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
        struct bucket *g;
        struct bucket_mark old, new;
 
@@ -446,7 +468,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
            gc_will_visit(c, pos))
                return;
 
-       old = bucket_data_cmpxchg(c, ca, g, new, ({
+       old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
@@ -466,10 +488,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
 {
+       struct bch_fs_usage *stats;
        struct bucket *g;
        struct bucket_mark old, new;
 
-       BUG_ON(!type);
+       BUG_ON(type != BCH_DATA_SB &&
+              type != BCH_DATA_JOURNAL);
 
        if (likely(c)) {
                percpu_rwsem_assert_held(&c->usage_lock);
@@ -479,16 +503,17 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                        return;
        }
 
-       rcu_read_lock();
+       preempt_disable();
+       stats = this_cpu_ptr(c->usage_percpu);
 
        g = bucket(ca, b);
-       old = bucket_data_cmpxchg(c, ca, g, new, ({
+       old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                new.data_type = type;
                checked_add(new.dirty_sectors, sectors);
-               new.dirty_sectors += sectors;
        }));
 
-       rcu_read_unlock();
+       stats->replicas[0].data[type] += sectors;
+       preempt_enable();
 
        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
@@ -589,7 +614,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, old, new);
+       bch2_dev_usage_update(c, ca, stats, old, new);
 
        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
@@ -601,6 +626,10 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                   struct bch_fs_usage *stats,
                   u64 journal_seq, unsigned flags)
 {
+       unsigned replicas = bch2_extent_nr_dirty_ptrs(k);
+
+       BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas));
+
        /*
         * synchronization w.r.t. GC:
         *
@@ -643,32 +672,22 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
-               unsigned replicas = 0;
 
                BUG_ON(!sectors);
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
+               extent_for_each_ptr_crc(e, ptr, crc)
                        bch2_mark_pointer(c, e, ptr, crc, sectors, data_type,
                                          stats, journal_seq, flags);
-                       replicas += !ptr->cached;
-               }
 
-               if (replicas) {
-                       BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
-                       stats->s[replicas - 1].data[data_type] += sectors;
-               }
+               if (replicas)
+                       stats->replicas[replicas - 1].data[data_type] += sectors;
                break;
        }
-       case BCH_RESERVATION: {
-               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-               if (r.v->nr_replicas) {
-                       BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
-                       stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
-               }
+       case BCH_RESERVATION:
+               if (replicas)
+                       stats->replicas[replicas - 1].persistent_reserved += sectors;
                break;
        }
-       }
        percpu_up_read(&c->usage_lock);
 }
 
@@ -681,7 +700,7 @@ static u64 __recalc_sectors_available(struct bch_fs *c)
        for_each_possible_cpu(cpu)
                per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
 
-       return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
+       return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
 }
 
 /* Used by gc when it's starting: */
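
The buckets.c changes amount to three things. First, superblock and journal usage is summed at bucket granularity into sum.hidden, so a partially filled journal bucket no longer shows up as free space. Second, avail_factor() had an operator-precedence bug: the old expression divided by (1 << RESERVE_FACTOR) and then added 1, returning r + 1 instead of scaling r down. Third, the avail_factor() scaling moves out of bch2_fs_sectors_free() and into __recalc_sectors_available(), so statfs now sees raw free sectors. A standalone sketch of the arithmetic (simplified; RESERVE_FACTOR = 6 is an assumption, its definition isn't part of this diff):

    #include <stdio.h>
    #include <stdint.h>

    #define RESERVE_FACTOR 6

    /* fixed version: scales r down to 64/65 of its value */
    static uint64_t avail_factor(uint64_t r)
    {
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
    }

    /* old version: precedence made this r + 1 */
    static uint64_t avail_factor_old(uint64_t r)
    {
        return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1;
    }

    int main(void)
    {
        /* one 1024-sector journal bucket holding only 600 live sectors
         * is accounted as 1024 hidden sectors: */
        uint64_t hidden   = 1024;
        uint64_t data     = 600 * 2;        /* 600 sectors at 2 replicas */
        uint64_t capacity = 1 << 20;
        uint64_t used     = hidden + data;  /* as in __bch2_fs_sectors_used() */

        printf("avail: %llu (new) vs %llu (old)\n",
               (unsigned long long)avail_factor(capacity - used),
               (unsigned long long)avail_factor_old(capacity - used));
        return 0;
    }

With RESERVE_FACTOR = 6 the fixed avail_factor() withholds 1/65 of nominal free space; the old expression withheld nothing at all.
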
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 016201b..9aeccbb 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -173,9 +173,7 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
 void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                         struct disk_reservation *, struct gc_pos);
 
-u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
 
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index d528194..9968570 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -69,7 +69,9 @@ struct bch_fs_usage {
        struct {
                u64             data[BCH_DATA_NR];
                u64             persistent_reserved;
-       }                       s[BCH_REPLICAS_MAX];
+       }                       replicas[BCH_REPLICAS_MAX];
+
+       u64                     buckets[BCH_DATA_NR];
 };
 
 /*
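
Units are worth spelling out: replicas[].data[] counts sectors, while the new buckets[] array counts whole buckets' worth of sectors — the bch2_dev_usage_update() hunk above moves a full ca->mi.bucket_size between entries whenever a bucket changes type, no matter how many of its sectors are live. That granularity difference is what lets __fs_usage_sum() charge internal fragmentation in sb/journal buckets as used space. A small model of the update (bucket_type(), which maps a bucket's mark to its data type, isn't shown in this diff; the enum ordering is assumed):

    #include <stdint.h>
    typedef uint64_t u64;

    enum { MODEL_DATA_NONE, MODEL_DATA_SB, MODEL_DATA_JOURNAL,
           MODEL_DATA_BTREE, MODEL_DATA_USER, MODEL_DATA_CACHED,
           MODEL_DATA_NR };

    /* bucket-granular move, as in the bch2_dev_usage_update() hunk: */
    static void buckets_update(u64 buckets[MODEL_DATA_NR],
                               unsigned old_type, unsigned new_type,
                               u64 bucket_size)
    {
        buckets[old_type] -= bucket_size;
        buckets[new_type] += bucket_size;
    }
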
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 283828f..db0f990 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -404,10 +404,10 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 
                for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                        dst.persistent_reserved[i] =
-                               src.s[i].persistent_reserved;
+                               src.replicas[i].persistent_reserved;
 
                        for (j = 0; j < BCH_DATA_NR; j++)
-                               dst.sectors[j][i] = src.s[i].data[j];
+                               dst.sectors[j][i] = src.replicas[i].data[j];
                }
 
                ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 5963f88..67ddad9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1428,13 +1428,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct bch_fs *c = sb->s_fs_info;
+       struct bch_fs_usage usage = bch2_fs_usage_read(c);
+       u64 hidden_metadata = usage.buckets[BCH_DATA_SB] +
+               usage.buckets[BCH_DATA_JOURNAL];
+       unsigned shift = sb->s_blocksize_bits - 9;
        u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
-       buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
-       buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
-                          PAGE_SECTOR_SHIFT;
+       buf->f_blocks   = (c->capacity - hidden_metadata) >> shift;
+       buf->f_bfree    = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift;
        buf->f_bavail   = buf->f_bfree;
        buf->f_files    = atomic_long_read(&c->nr_inodes);
        buf->f_ffree    = U64_MAX;
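
Worked example of the new statfs math: with 4K filesystem blocks, shift = s_blocksize_bits - 9 = 3 (sectors are 512 bytes), so a filesystem with capacity 2097152 sectors (1 GiB) and 16384 sectors of sb+journal buckets reports f_blocks = (2097152 - 16384) >> 3 = 260096. Previously f_blocks shifted by PAGE_SECTOR_SHIFT even though f_bsize was s_blocksize, so the two disagreed whenever the block size differed from the page size, and f_bfree came pre-scaled by avail_factor(). A sketch with invented numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t capacity        = 2097152; /* sectors: 1 GiB */
        uint64_t hidden_metadata = 16384;   /* sb + journal buckets */
        uint64_t used            = 262144;  /* bch2_fs_sectors_used(); includes hidden */
        unsigned shift           = 12 - 9;  /* s_blocksize_bits - 9, 4K blocks */

        printf("f_blocks = %llu\n",
               (unsigned long long)((capacity - hidden_metadata) >> shift));
        printf("f_bfree  = %llu\n",
               (unsigned long long)((capacity - used) >> shift));
        return 0;
    }
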
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index db8af44..4ce7168 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -238,7 +238,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
                         "capacity:\t\t%llu\n",
                         c->capacity);
 
-       for (replicas = 0; replicas < ARRAY_SIZE(stats.s); replicas++) {
+       for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) {
                out += scnprintf(out, end - out,
                                 "%u replicas:\n",
                                 replicas + 1);
@@ -247,12 +247,20 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
                        out += scnprintf(out, end - out,
                                         "\t%s:\t\t%llu\n",
                                         bch2_data_types[type],
-                                        stats.s[replicas].data[type]);
+                                        stats.replicas[replicas].data[type]);
                out += scnprintf(out, end - out,
                                 "\treserved:\t%llu\n",
-                                stats.s[replicas].persistent_reserved);
+                                stats.replicas[replicas].persistent_reserved);
        }
 
+       out += scnprintf(out, end - out, "bucket usage\n");
+
+       for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
+               out += scnprintf(out, end - out,
+                                "\t%s:\t\t%llu\n",
+                                bch2_data_types[type],
+                                stats.buckets[type]);
+
        out += scnprintf(out, end - out,
                         "online reserved:\t%llu\n",
                         stats.online_reserved);
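
For reference, a hypothetical rendering of the extended output (numbers invented; the layout follows the scnprintf() format strings above, and the bucket-usage lines are bucket-granular, so sb/journal will typically read higher here than their per-replica sector counters):

    bucket usage
            sb:             4096
            journal:        12288
            btree:          65536
            user:           1048576
            cached:         32768
    online reserved:        1024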