btrfs: stop doing excessive space reservation for csum deletion
author Filipe Manana <fdmanana@suse.com>
Fri, 8 Sep 2023 17:20:37 +0000 (18:20 +0100)
committer David Sterba <dsterba@suse.com>
Thu, 12 Oct 2023 14:44:06 +0000 (16:44 +0200)
Currently when reserving space for deleting the csum items for a data
extent, when adding or updating a delayed ref head, we determine how
many leaves of csum items we can have and then pass that number to the
helper btrfs_calc_delayed_ref_bytes(). This helper is used for calculating
space for all tree modifications we need when running delayed references,
however the amount of space it computes is excessive for deleting csum
items because:

1) It uses btrfs_calc_insert_metadata_size() which is excessive because
   we only need to delete csum items from the csum tree, we don't need
   to insert any items, so btrfs_calc_metadata_size() is all we need (as
   it computes space needed to delete an item);

2) If the free space tree is enabled, it doubles the amount of space,
   which is pointless for csum deletion since we don't need to touch the
   free space tree or any other tree other than the csum tree.

So improve on this by tracking how many csum deletions we have and using
a new helper to calculate space for csum deletions (just a wrapper around
btrfs_calc_metadata_size() with a comment). This reduces the amount of
space we need to reserve for csum deletions by a factor of 4, and it helps
reduce the number of times we have to block space reservations and have
the reclaim task enter the space flushing algorithm (flush delayed items,
flush delayed refs, etc) in order to satisfy tickets.

For example this results in a total time decrease when unlinking (or
truncating) files with many extents, as we end up having to block on
metadata space reservations less often. Example test:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/nullb0
  MNT=/mnt/test

  umount $DEV &> /dev/null
  mkfs.btrfs -f $DEV
  # Use compression to quickly create files with a lot of extents
  # (each with a size of 128K).
  mount -o compress=lzo $DEV $MNT

  # 120G gives at least 983040 extents with a size of 128K.
  xfs_io -f -c "pwrite -S 0xab -b 1M 0 120G" $MNT/foobar

  # Flush all delalloc and clear all metadata from memory.
  umount $MNT
  mount -o compress=lzo $DEV $MNT

  start=$(date +%s%N)
  rm -f $MNT/foobar
  end=$(date +%s%N)
  dur=$(( (end - start) / 1000000 ))
  echo "rm took $dur milliseconds"

  umount $MNT

Before this change rm took: 7504 milliseconds
After this change rm took:  6574 milliseconds  (-12.4%)

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/block-group.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index fb506ee..82c77db 100644 (file)
@@ -1286,7 +1286,7 @@ out:
        /* Once for the lookup reference */
        btrfs_put_block_group(block_group);
        if (remove_rsv)
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
        btrfs_free_path(path);
        return ret;
 }
@@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 
                /* Already aborted the transaction if it failed. */
 next:
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                list_del_init(&block_group->bg_list);
                clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
        }
@@ -3370,7 +3370,7 @@ again:
                if (should_put)
                        btrfs_put_block_group(cache);
                if (drop_reserve)
-                       btrfs_delayed_refs_rsv_release(fs_info, 1);
+                       btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                /*
                 * Avoid blocking other tasks for too long. It might even save
                 * us from writing caches for block groups that are going to be
@@ -3517,7 +3517,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
                /* If its not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
index e975892..ecfbc2d 100644 (file)
@@ -57,17 +57,21 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
  * Release a ref head's reservation.
  *
  * @fs_info:  the filesystem
- * @nr:       number of items to drop
+ * @nr_refs:  number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
  *
  * Drops the delayed ref head's count from the delayed refs rsv and free any
  * excess reservation we had.
  */
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
 {
        struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-       const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
+       u64 num_bytes;
        u64 released;
 
+       num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+       num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
+
        released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
        if (released)
                trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -77,8 +81,9 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 /*
  * Adjust the size of the delayed refs rsv.
  *
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
  */
 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 {
@@ -86,17 +91,19 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
        u64 num_bytes;
 
-       if (!trans->delayed_ref_updates)
-               return;
+       num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+       num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+                                                      trans->delayed_ref_csum_deletions);
 
-       num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
-                                                trans->delayed_ref_updates);
+       if (num_bytes == 0)
+               return;
 
        spin_lock(&delayed_rsv->lock);
        delayed_rsv->size += num_bytes;
        delayed_rsv->full = false;
        spin_unlock(&delayed_rsv->lock);
        trans->delayed_ref_updates = 0;
+       trans->delayed_ref_csum_deletions = 0;
 }
 
 /*
@@ -434,7 +441,7 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
                list_del(&ref->add_list);
        btrfs_put_delayed_ref(ref);
        atomic_dec(&delayed_refs->num_entries);
-       btrfs_delayed_refs_rsv_release(fs_info, 1);
+       btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
 }
 
 static bool merge_ref(struct btrfs_fs_info *fs_info,
@@ -710,11 +717,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 
                if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
                        delayed_refs->pending_csums -= existing->num_bytes;
-                       btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+                       btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
                }
                if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
                        delayed_refs->pending_csums += existing->num_bytes;
-                       trans->delayed_ref_updates += csum_leaves;
+                       trans->delayed_ref_csum_deletions += csum_leaves;
                }
        }
 
@@ -834,7 +841,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
                 */
                if (head_ref->is_data && head_ref->ref_mod < 0) {
                        delayed_refs->pending_csums += head_ref->num_bytes;
-                       trans->delayed_ref_updates +=
+                       trans->delayed_ref_csum_deletions +=
                                btrfs_csum_bytes_to_leaves(trans->fs_info,
                                                           head_ref->num_bytes);
                }
index 2f275ad..783f84c 100644 (file)
@@ -283,6 +283,17 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
        return num_bytes;
 }
 
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+                                                   int num_csum_items)
+{
+       /*
+        * Deleting csum items does not result in new nodes/leaves and does not
+        * require changing the free space tree, only the csum tree, so this is
+        * all we need.
+        */
+       return btrfs_calc_metadata_size(fs_info, num_csum_items);
+}
+
 static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
                                int action, u64 bytenr, u64 len, u64 parent)
 {
@@ -407,7 +418,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
                                  enum btrfs_reserve_flush_enum flush);
index c9d1df5..43a6ca7 100644 (file)
@@ -4563,7 +4563,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                list_del(&ref->add_list);
                        atomic_dec(&delayed_refs->num_entries);
                        btrfs_put_delayed_ref(ref);
-                       btrfs_delayed_refs_rsv_release(fs_info, 1);
+                       btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                }
                if (head->must_insert_reserved)
                        pin_bytes = true;
@@ -4761,7 +4761,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
 
                spin_unlock(&cur_trans->dirty_bgs_lock);
                btrfs_put_block_group(cache);
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
index 4b596f6..4135e6e 100644 (file)
@@ -1824,16 +1824,16 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
         * to drop the csum leaves for this update from our delayed_refs_rsv.
         */
        if (head->total_ref_mod < 0 && head->is_data) {
-               int nr_items;
+               int nr_csums;
 
                spin_lock(&delayed_refs->lock);
                delayed_refs->pending_csums -= head->num_bytes;
                spin_unlock(&delayed_refs->lock);
-               nr_items = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+               nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
 
-               btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+               btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
 
-               return btrfs_calc_delayed_ref_bytes(fs_info, nr_items);
+               return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
        }
 
        return 0;
@@ -1985,7 +1985,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
 
                ret = run_one_delayed_ref(trans, ref, extent_op,
                                          must_insert_reserved);
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
                btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
index 9811f12..a627a4d 100644 (file)
@@ -2085,7 +2085,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
        struct btrfs_block_group *block_group, *tmp;
 
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                list_del_init(&block_group->bg_list);
        }
 }
index 93869cd..68c89b1 100644 (file)
@@ -120,6 +120,7 @@ struct btrfs_trans_handle {
        u64 bytes_reserved;
        u64 chunk_bytes_reserved;
        unsigned long delayed_ref_updates;
+       unsigned long delayed_ref_csum_deletions;
        struct btrfs_transaction *transaction;
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_block_rsv *orig_rsv;