btrfs: add structure to keep track of extent range in end_bio_extent_readpage
authorQu Wenruo <wqu@suse.com>
Fri, 13 Nov 2020 12:51:28 +0000 (20:51 +0800)
committerDavid Sterba <dsterba@suse.com>
Tue, 8 Dec 2020 14:54:13 +0000 (15:54 +0100)
In end_bio_extent_readpage() we had a strange dance around
extent_start/extent_len.

Hidden behind the strange dance is, it's just calling
endio_readpage_release_extent() on each bvec range.

Here is an example to explain the original work flow:

  Bio is for inode 257, containing 2 pages, for range [1M, 1M+8K)

  end_bio_extent_extent_readpage() entered
  |- extent_start = 0;
  |- extent_end = 0;
  |- bio_for_each_segment_all() {
  |  |- /* Got the 1st bvec */
  |  |- start = SZ_1M;
  |  |- end = SZ_1M + SZ_4K - 1;
  |  |- update = 1;
  |  |- if (extent_len == 0) {
  |  |  |- extent_start = start; /* SZ_1M */
  |  |  |- extent_len = end + 1 - start; /* SZ_1M */
  |  |  }
  |  |
  |  |- /* Got the 2nd bvec */
  |  |- start = SZ_1M + 4K;
  |  |- end = SZ_1M + 4K - 1;
  |  |- update = 1;
  |  |- if (extent_start + extent_len == start) {
  |  |  |- extent_len += end + 1 - start; /* SZ_8K */
  |  |  }
  |  } /* All bio vec iterated */
  |
  |- if (extent_len) {
     |- endio_readpage_release_extent(tree, extent_start, extent_len,
      update);
/* extent_start == SZ_1M, extent_len == SZ_8K, uptodate = 1 */

As the above flow shows, the existing code in end_bio_extent_readpage()
is accumulates extent_start/extent_len, and when the contiguous range
stops, calls endio_readpage_release_extent() for the range.

However current behavior has something not really considered:

- The inode can change
  For bio, its pages don't need to have contiguous page_offset.
  This means, even pages from different inodes can be packed into one
  bio.

- bvec cross page boundary
  There is a feature called multi-page bvec, where bvec->bv_len can go
  beyond bvec->bv_page boundary.

- Poor readability

This patch will address the problem:

- Introduce a proper structure, processed_extent, to record processed
  extent range

- Integrate inode/start/end/uptodate check into
  endio_readpage_release_extent()

- Add more comment on each step.
  This should greatly improve the readability, now in
  end_bio_extent_readpage() there are only two
  endio_readpage_release_extent() calls.

- Add inode check for contiguity
  Now we also ensure the inode is the same one before checking if the
  range is contiguous.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/extent_io.c

index 07db1af..519f745 100644 (file)
@@ -2769,16 +2769,77 @@ static void end_bio_extent_writepage(struct bio *bio)
        bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
-                             int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+       struct btrfs_inode *inode;
+       /* Start of the range in @inode */
+       u64 start;
+       /* End of the range in in @inode */
+       u64 end;
+       bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release processed extent when any of @inode, @uptodate, the range is
+ * no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+                             struct btrfs_inode *inode, u64 start, u64 end,
+                             bool uptodate)
 {
        struct extent_state *cached = NULL;
-       u64 end = start + len - 1;
+       struct extent_io_tree *tree;
+
+       /* The first extent, initialize @processed */
+       if (!processed->inode)
+               goto update;
 
-       if (uptodate && tree->track_uptodate)
-               set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-       unlock_extent_cached_atomic(tree, start, end, &cached);
+       /*
+        * Contiguous to processed extent, just uptodate the end.
+        *
+        * Several things to notice:
+        *
+        * - bio can be merged as long as on-disk bytenr is contiguous
+        *   This means we can have page belonging to other inodes, thus need to
+        *   check if the inode still matches.
+        * - bvec can contain range beyond current page for multi-page bvec
+        *   Thus we need to do processed->end + 1 >= start check
+        */
+       if (processed->inode == inode && processed->uptodate == uptodate &&
+           processed->end + 1 >= start && end >= processed->end) {
+               processed->end = end;
+               return;
+       }
+
+       tree = &processed->inode->io_tree;
+       /*
+        * Now we don't have range contiguous to the processed range, release
+        * the processed range now.
+        */
+       if (processed->uptodate && tree->track_uptodate)
+               set_extent_uptodate(tree, processed->start, processed->end,
+                                   &cached, GFP_ATOMIC);
+       unlock_extent_cached_atomic(tree, processed->start, processed->end,
+                                   &cached);
+
+update:
+       /* Update processed to current range */
+       processed->inode = inode;
+       processed->start = start;
+       processed->end = end;
+       processed->uptodate = uptodate;
 }
 
 /*
@@ -2798,12 +2859,11 @@ static void end_bio_extent_readpage(struct bio *bio)
        int uptodate = !bio->bi_status;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
        struct extent_io_tree *tree, *failure_tree;
+       struct processed_extent processed = { 0 };
        u64 offset = 0;
        u64 start;
        u64 end;
        u64 len;
-       u64 extent_start = 0;
-       u64 extent_len = 0;
        int mirror;
        int ret;
        struct bvec_iter_all iter_all;
@@ -2912,32 +2972,11 @@ readpage_ok:
                unlock_page(page);
                offset += len;
 
-               if (unlikely(!uptodate)) {
-                       if (extent_len) {
-                               endio_readpage_release_extent(tree,
-                                                             extent_start,
-                                                             extent_len, 1);
-                               extent_start = 0;
-                               extent_len = 0;
-                       }
-                       endio_readpage_release_extent(tree, start,
-                                                     end - start + 1, 0);
-               } else if (!extent_len) {
-                       extent_start = start;
-                       extent_len = end + 1 - start;
-               } else if (extent_start + extent_len == start) {
-                       extent_len += end + 1 - start;
-               } else {
-                       endio_readpage_release_extent(tree, extent_start,
-                                                     extent_len, uptodate);
-                       extent_start = start;
-                       extent_len = end + 1 - start;
-               }
+               endio_readpage_release_extent(&processed, BTRFS_I(inode),
+                                             start, end, uptodate);
        }
-
-       if (extent_len)
-               endio_readpage_release_extent(tree, extent_start, extent_len,
-                                             uptodate);
+       /* Release the last extent */
+       endio_readpage_release_extent(&processed, NULL, 0, 0, false);
        btrfs_io_bio_free_csum(io_bio);
        bio_put(bio);
 }