Merge tag 'net-next-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev...
[linux-2.6-microblaze.git] / fs / btrfs / extent_io.c
index 9e81d25..aaddd72 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "extent_io.h"
 #include "extent-io-tree.h"
@@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 
        bio->bi_private = NULL;
 
+       /* Caller should ensure the bio has at least some range added */
+       ASSERT(bio->bi_iter.bi_size);
        if (is_data_inode(tree->private_data))
                ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
                                            bio_flags);
@@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
        return bitset;
 }
 
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
-       u64 start = page_offset(page);
-       u64 end = start + PAGE_SIZE - 1;
-       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
-               SetPageUptodate(page);
-}
-
 int free_io_failure(struct extent_io_tree *failure_tree,
                    struct extent_io_tree *io_tree,
                    struct io_failure_record *rec)
@@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
               start + len <= page_offset(page) + PAGE_SIZE);
 
        if (uptodate) {
-               btrfs_page_set_uptodate(fs_info, page, start, len);
+               if (fsverity_active(page->mapping->host) &&
+                   !PageError(page) &&
+                   !PageUptodate(page) &&
+                   start < i_size_read(page->mapping->host) &&
+                   !fsverity_verify_page(page)) {
+                       btrfs_page_set_error(fs_info, page, start, len);
+               } else {
+                       btrfs_page_set_uptodate(fs_info, page, start, len);
+               }
        } else {
                btrfs_page_clear_uptodate(fs_info, page, start, len);
                btrfs_page_set_error(fs_info, page, start, len);
@@ -2779,7 +2778,7 @@ next:
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
        struct btrfs_inode *inode;
-       int uptodate = (err == 0);
+       const bool uptodate = (err == 0);
        int ret = 0;
 
        ASSERT(page && page->mapping);
@@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
        btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
 
        if (!uptodate) {
-               ClearPageUptodate(page);
-               SetPageError(page);
+               const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+               u32 len;
+
+               ASSERT(end + 1 - start <= U32_MAX);
+               len = end + 1 - start;
+
+               btrfs_page_clear_uptodate(fs_info, page, start, len);
+               btrfs_page_set_error(fs_info, page, start, len);
                ret = err < 0 ? err : -EIO;
                mapping_set_error(page->mapping, ret);
        }
@@ -3097,7 +3102,7 @@ readpage_ok:
                /* Update page status and unlock */
                end_page_read(page, uptodate, start, len);
                endio_readpage_release_extent(&processed, BTRFS_I(inode),
-                                             start, end, uptodate);
+                                             start, end, PageUptodate(page));
        }
        /* Release the last extent */
        endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
        return bio;
 }
 
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
        struct bio *bio;
        struct btrfs_io_bio *btrfs_bio;
 
+       ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
        /* this will never fail when it's backed by a bioset */
        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
        ASSERT(bio);
@@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
  * @size:      portion of page that we want to write
  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
  * @bio_flags: flags of the current bio to see if we can merge them
- * @return:    true if page was added, false otherwise
  *
  * Attempt to add a page to bio considering stripe alignment etc.
  *
- * Return true if successfully page added. Otherwise, return false.
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
  */
-static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
-                              struct page *page,
-                              u64 disk_bytenr, unsigned int size,
-                              unsigned int pg_offset,
-                              unsigned long bio_flags)
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+                             struct page *page,
+                             u64 disk_bytenr, unsigned int size,
+                             unsigned int pg_offset,
+                             unsigned long bio_flags)
 {
        struct bio *bio = bio_ctrl->bio;
        u32 bio_size = bio->bi_iter.bi_size;
+       u32 real_size;
        const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
        bool contig;
        int ret;
@@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
        /* The limit should be calculated when bio_ctrl->bio is allocated */
        ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
        if (bio_ctrl->bio_flags != bio_flags)
-               return false;
+               return 0;
 
        if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
                contig = bio->bi_iter.bi_sector == sector;
        else
                contig = bio_end_sector(bio) == sector;
        if (!contig)
-               return false;
+               return 0;
 
-       if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
-           bio_size + size > bio_ctrl->len_to_stripe_boundary)
-               return false;
+       real_size = min(bio_ctrl->len_to_oe_boundary,
+                       bio_ctrl->len_to_stripe_boundary) - bio_size;
+       real_size = min(real_size, size);
+
+       /*
+        * If real_size is 0, never call bio_add_*_page(), as even if the size
+        * is 0, the bio will still execute its endio function on the page!
+        */
+       if (real_size == 0)
+               return 0;
 
        if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-               ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+               ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
        else
-               ret = bio_add_page(bio, page, size, pg_offset);
+               ret = bio_add_page(bio, page, real_size, pg_offset);
 
-       return ret == size;
+       return ret;
 }
 
 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
-                              struct btrfs_inode *inode)
+                              struct btrfs_inode *inode, u64 file_offset)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_io_geometry geom;
@@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
                return 0;
        }
 
-       ASSERT(fs_info->max_zone_append_size > 0);
        /* Ordered extent not yet created, so we're good */
-       ordered = btrfs_lookup_ordered_extent(inode, logical);
+       ordered = btrfs_lookup_ordered_extent(inode, file_offset);
        if (!ordered) {
                bio_ctrl->len_to_oe_boundary = U32_MAX;
                return 0;
@@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
        return 0;
 }
 
+static int alloc_new_bio(struct btrfs_inode *inode,
+                        struct btrfs_bio_ctrl *bio_ctrl,
+                        struct writeback_control *wbc,
+                        unsigned int opf,
+                        bio_end_io_t end_io_func,
+                        u64 disk_bytenr, u32 offset, u64 file_offset,
+                        unsigned long bio_flags)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct bio *bio;
+       int ret;
+
+       /*
+        * Set up a new bio in @bio_ctrl.  A compressed page range always
+        * starts at the @disk_bytenr passed in, whatever went into a previous bio.
+        */
+       if (bio_flags & EXTENT_BIO_COMPRESSED)
+               bio = btrfs_bio_alloc(disk_bytenr);
+       else
+               bio = btrfs_bio_alloc(disk_bytenr + offset);
+       bio_ctrl->bio = bio;
+       bio_ctrl->bio_flags = bio_flags;
+       bio->bi_end_io = end_io_func;
+       bio->bi_private = &inode->io_tree;
+       bio->bi_write_hint = inode->vfs_inode.i_write_hint;
+       bio->bi_opf = opf;
+       ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+       if (ret < 0)
+               goto error;
+       if (wbc) {      /* @wbc is optional: only then bind latest_bdev and @wbc */
+               struct block_device *bdev;
+
+               bdev = fs_info->fs_devices->latest_bdev;
+               bio_set_dev(bio, bdev);
+               wbc_init_bio(wbc, bio);
+       }
+       if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               struct btrfs_device *device;
+
+               device = btrfs_zoned_get_device(fs_info, disk_bytenr,
+                                               fs_info->sectorsize);
+               if (IS_ERR(device)) {
+                       ret = PTR_ERR(device);
+                       goto error;
+               }
+
+               btrfs_io_bio(bio)->device = device;
+       }
+       return 0;
+error: /* detach the bio from @bio_ctrl and end it with @ret as its status */
+       bio_ctrl->bio = NULL;
+       bio->bi_status = errno_to_blk_status(ret);
+       bio_endio(bio);
+       return ret;
+}
+
 /*
  * @opf:       bio REQ_OP_* and REQ_* flags as one value
  * @wbc:       optional writeback control for io accounting
@@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf,
                              bool force_bio_submit)
 {
        int ret = 0;
-       struct bio *bio;
-       size_t io_size = min_t(size_t, size, PAGE_SIZE);
        struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-       struct extent_io_tree *tree = &inode->io_tree;
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       unsigned int cur = pg_offset;
 
        ASSERT(bio_ctrl);
 
        ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
               pg_offset + size <= PAGE_SIZE);
-       if (bio_ctrl->bio) {
-               bio = bio_ctrl->bio;
-               if (force_bio_submit ||
-                   !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
-                                       pg_offset, bio_flags)) {
-                       ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+       if (force_bio_submit && bio_ctrl->bio) {
+               ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+               bio_ctrl->bio = NULL;
+               if (ret < 0)
+                       return ret;
+       }
+
+       while (cur < pg_offset + size) {
+               u32 offset = cur - pg_offset;
+               int added;
+
+               /* Allocate new bio if needed */
+               if (!bio_ctrl->bio) {
+                       ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+                                           end_io_func, disk_bytenr, offset,
+                                           page_offset(page) + cur,
+                                           bio_flags);
+                       if (ret < 0)
+                               return ret;
+               }
+               /*
+                * We must go through btrfs_bio_add_page() to ensure each
+                * page range won't cross various boundaries.
+                */
+               if (bio_flags & EXTENT_BIO_COMPRESSED)
+                       added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+                                       size - offset, pg_offset + offset,
+                                       bio_flags);
+               else
+                       added = btrfs_bio_add_page(bio_ctrl, page,
+                                       disk_bytenr + offset, size - offset,
+                                       pg_offset + offset, bio_flags);
+
+               /* Metadata page range should never be split */
+               if (!is_data_inode(&inode->vfs_inode))
+                       ASSERT(added == 0 || added == size - offset);
+
+               /* We added at least part of the page, update the accounting */
+               if (wbc && added)
+                       wbc_account_cgroup_owner(wbc, page, added);
+
+               /* We have reached boundary, submit right now */
+               if (added < size - offset) {
+                       /* The bio should contain some page(s) */
+                       ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+                       ret = submit_one_bio(bio_ctrl->bio, mirror_num,
+                                       bio_ctrl->bio_flags);
                        bio_ctrl->bio = NULL;
                        if (ret < 0)
                                return ret;
-               } else {
-                       if (wbc)
-                               wbc_account_cgroup_owner(wbc, page, io_size);
-                       return 0;
                }
+               cur += added;
        }
-
-       bio = btrfs_bio_alloc(disk_bytenr);
-       bio_add_page(bio, page, io_size, pg_offset);
-       bio->bi_end_io = end_io_func;
-       bio->bi_private = tree;
-       bio->bi_write_hint = page->mapping->host->i_write_hint;
-       bio->bi_opf = opf;
-       if (wbc) {
-               struct block_device *bdev;
-
-               bdev = fs_info->fs_devices->latest_bdev;
-               bio_set_dev(bio, bdev);
-               wbc_init_bio(wbc, bio);
-               wbc_account_cgroup_owner(wbc, page, io_size);
-       }
-       if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
-               struct btrfs_device *device;
-
-               device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
-               if (IS_ERR(device))
-                       return PTR_ERR(device);
-
-               btrfs_io_bio(bio)->device = device;
-       }
-
-       bio_ctrl->bio = bio;
-       bio_ctrl->bio_flags = bio_flags;
-       ret = calc_bio_boundaries(bio_ctrl, inode);
-
-       return ret;
+       return 0;
 }
 
 static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        size_t pg_offset = 0;
        size_t iosize;
        size_t blocksize = inode->i_sb->s_blocksize;
-       unsigned long this_bio_flag = 0;
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
        ret = set_page_extent_mapped(page);
@@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        }
        begin_page_read(fs_info, page);
        while (cur <= end) {
+               unsigned long this_bio_flag = 0;
                bool force_bio_submit = false;
                u64 disk_bytenr;
 
@@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                /* the get_extent function already copied into the page */
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
-                       check_page_uptodate(tree, page);
                        unlock_extent(tree, cur, cur + iosize - 1);
                        end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
@@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
                ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
                                delalloc_end, &page_started, nr_written, wbc);
                if (ret) {
-                       SetPageError(page);
-                       /*
-                        * btrfs_run_delalloc_range should return < 0 for error
-                        * but just in case, we use > 0 here meaning the IO is
-                        * started, so we don't want to return > 0 unless
-                        * things are going well.
-                        */
-                       return ret < 0 ? ret : -EIO;
+                       btrfs_page_set_error(inode->root->fs_info, page,
+                                            page_offset(page), PAGE_SIZE);
+                       return ret;
                }
                /*
                 * delalloc_end is already one less than the total length, so
@@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                                 int *nr_ret)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       u64 start = page_offset(page);
-       u64 end = start + PAGE_SIZE - 1;
-       u64 cur = start;
+       u64 cur = page_offset(page);
+       u64 end = cur + PAGE_SIZE - 1;
        u64 extent_offset;
        u64 block_start;
        struct extent_map *em;
@@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
        const unsigned int write_flags = wbc_to_write_flags(wbc);
        bool compressed;
 
-       ret = btrfs_writepage_cow_fixup(page, start, end);
+       ret = btrfs_writepage_cow_fixup(page);
        if (ret) {
                /* Fixup worker will requeue */
                redirty_page_for_writepage(wbc, page);
@@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 
                if (cur >= i_size) {
                        btrfs_writepage_endio_finish_ordered(inode, page, cur,
-                                                            end, 1);
+                                                            end, true);
+                       /*
+                        * This range is beyond i_size, thus we don't need to
+                        * bother writing back.
+                        * But we still need to clear the dirty subpage bit, or
+                        * the next time the page gets dirtied, we will try to
+                        * writeback the sectors with subpage dirty bits,
+                        * causing writeback without ordered extent.
+                        */
+                       btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
                        break;
                }
 
@@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                                nr++;
                        else
                                btrfs_writepage_endio_finish_ordered(inode,
-                                               page, cur, cur + iosize - 1, 1);
+                                               page, cur, cur + iosize - 1, true);
+                       btrfs_page_clear_dirty(fs_info, page, cur, iosize);
                        cur += iosize;
                        continue;
                }
@@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                cur += iosize;
                nr++;
        }
+       /*
+        * If we finished without problems, not only should the page dirty flag
+        * be cleared, but the subpage dirty bits should be empty as well.
+        */
+       if (!ret)
+               btrfs_page_assert_not_dirty(fs_info, page);
        *nr_ret = nr;
        return ret;
 }
@@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
        WARN_ON(!PageLocked(page));
 
-       ClearPageError(page);
+       btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+                              page_offset(page), PAGE_SIZE);
 
        pg_offset = offset_in_page(i_size);
        if (page->index > end_index ||
@@ -4022,10 +4109,39 @@ done:
                set_page_writeback(page);
                end_page_writeback(page);
        }
-       if (PageError(page)) {
-               ret = ret < 0 ? ret : -EIO;
+       /*
+        * Here we used to have a check for PageError() and then set @ret and
+        * call end_extent_writepage().
+        *
+        * But in fact setting @ret here will cause different error paths
+        * between subpage and regular sectorsize.
+        *
+        * For regular page size, we never submit the current page, but only
+        * add it to the current bio.
+        * The bio submission can only happen in the next page.
+        * Thus if we hit the PageError() branch, @ret is already set to a
+        * non-zero value and will not get updated for regular sectorsize.
+        *
+        * But for subpage case, it's possible we submit part of current page,
+        * thus can get PageError() set by submitted bio of the same page,
+        * while our @ret is still 0.
+        *
+        * So here we unify the behavior and don't set @ret.
+        * Error can still be properly passed to higher layer as page will
+        * be set error, here we just don't handle the IO failure.
+        *
+        * NOTE: This is just a hotfix for subpage.
+        * The root fix will be properly ending ordered extent when we hit
+        * an error during writeback.
+        *
+        * But that needs a bigger refactoring, as we not only need to grab the
+        * submitted OE, but also need to know exactly at which bytenr we hit
+        * the error.
+        * Currently the full page based __extent_writepage_io() is not
+        * capable of that.
+        */
+       if (PageError(page))
                end_extent_writepage(page, ret, start, page_end);
-       }
        unlock_page(page);
        ASSERT(ret <= 0);
        return ret;
@@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
                        ret = __extent_writepage(page, &wbc_writepages, &epd);
                else {
                        btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
-                                       page, start, start + PAGE_SIZE - 1, 1);
+                                       page, start, start + PAGE_SIZE - 1, true);
                        unlock_page(page);
                }
                put_page(page);