Merge tag 'for-5.11/drivers-2020-12-14' of git://git.kernel.dk/linux-block
[linux-2.6-microblaze.git] fs/btrfs/inode.c
index 9570458..8e23780 100644
@@ -6,7 +6,6 @@
 #include <crypto/hash.h>
 #include <linux/kernel.h>
 #include <linux/bio.h>
-#include <linux/buffer_head.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -31,6 +30,7 @@
 #include <linux/swap.h>
 #include <linux/migrate.h>
 #include <linux/sched/mm.h>
+#include <linux/iomap.h>
 #include <asm/unaligned.h>
 #include "misc.h"
 #include "ctree.h"
@@ -45,7 +45,6 @@
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
-#include "inode-map.h"
 #include "props.h"
 #include "qgroup.h"
 #include "delalloc-space.h"
@@ -59,9 +58,9 @@ struct btrfs_iget_args {
 
 struct btrfs_dio_data {
        u64 reserve;
-       u64 unsubmitted_oe_range_start;
-       u64 unsubmitted_oe_range_end;
-       int overwrite;
+       loff_t length;
+       ssize_t submitted;
+       struct extent_changeset *data_reserved;
 };
 
 static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +69,6 @@ static const struct inode_operations btrfs_special_inode_operations;
 static const struct inode_operations btrfs_file_inode_operations;
 static const struct address_space_operations btrfs_aops;
 static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
@@ -95,6 +93,51 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);
 
+/*
+ * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ *
+ * ilock_flags can have the following bits set:
+ *
+ * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
+ * BTRFS_ILOCK_TRY - try to acquire the lock; if it fails on the first
+ *                   attempt, return -EAGAIN
+ */
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+{
+       if (ilock_flags & BTRFS_ILOCK_SHARED) {
+               if (ilock_flags & BTRFS_ILOCK_TRY) {
+                       if (!inode_trylock_shared(inode))
+                               return -EAGAIN;
+                       else
+                               return 0;
+               }
+               inode_lock_shared(inode);
+       } else {
+               if (ilock_flags & BTRFS_ILOCK_TRY) {
+                       if (!inode_trylock(inode))
+                               return -EAGAIN;
+                       else
+                               return 0;
+               }
+               inode_lock(inode);
+       }
+       return 0;
+}
+
+/*
+ * btrfs_inode_unlock - unlock inode i_rwsem
+ *
+ * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
+ * to decide whether the lock acquired is shared or exclusive.
+ */
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+{
+       if (ilock_flags & BTRFS_ILOCK_SHARED)
+               inode_unlock_shared(inode);
+       else
+               inode_unlock(inode);
+}
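/*
 * A minimal caller sketch (illustration only, not part of this patch),
 * assuming only the BTRFS_ILOCK_* flags documented above: a non-blocking
 * path can take the lock shared and back off with -EAGAIN instead of
 * sleeping.
 */
static int example_try_shared_read(struct inode *inode)
{
        int ret;

        ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
        if (ret)        /* -EAGAIN: could not get the shared lock without blocking */
                return ret;

        /* ... do the read under the shared i_rwsem ... */

        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
        return 0;
}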
+
 /*
  * Cleanup all submitted ordered extents in specified range to handle errors
  * from the btrfs_run_delalloc_range() callback.
@@ -140,13 +183,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
-       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@ -165,7 +201,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  * no overlapping inline items exist in the btree
  */
 static int insert_inline_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_path *path, int extent_inserted,
+                               struct btrfs_path *path, bool extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
@@ -186,8 +222,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
        if (compressed_size && compressed_pages)
                cur_size = compressed_size;
 
-       inode_add_bytes(inode, size);
-
        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;
@@ -197,7 +231,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                key.type = BTRFS_EXTENT_DATA_KEY;
 
                datasize = btrfs_file_extent_calc_inline_size(cur_size);
-               path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
@@ -263,8 +296,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
-       ret = btrfs_update_inode(trans, root, inode);
-
 fail:
        return ret;
 }
@@ -280,6 +311,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
                                          int compress_type,
                                          struct page **compressed_pages)
 {
+       struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
@@ -290,8 +322,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
-       int extent_inserted = 0;
-       u32 extent_item_size;
 
        if (compressed_size)
                data_len = compressed_size;
@@ -317,16 +347,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
        }
        trans->block_rsv = &inode->block_rsv;
 
+       drop_args.path = path;
+       drop_args.start = start;
+       drop_args.end = aligned_end;
+       drop_args.drop_cache = true;
+       drop_args.replace_extent = true;
+
        if (compressed_size && compressed_pages)
-               extent_item_size = btrfs_file_extent_calc_inline_size(
+               drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
-               extent_item_size = btrfs_file_extent_calc_inline_size(
+               drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);
 
-       ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
-                                  NULL, 1, 1, extent_item_size,
-                                  &extent_inserted);
+       ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
@@ -334,7 +368,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
 
        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
-       ret = insert_inline_extent(trans, path, extent_inserted,
+       ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
                                   root, &inode->vfs_inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
@@ -346,8 +380,17 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
                goto out;
        }
 
+       btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret && ret != -ENOSPC) {
+               btrfs_abort_transaction(trans, ret);
+               goto out;
+       } else if (ret == -ENOSPC) {
+               ret = 1;
+               goto out;
+       }
+
        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
-       btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
        /*
         * Don't forget to free the reserved space, as for inlined extent
@@ -1605,6 +1648,15 @@ next_slot:
                                goto out_check;
                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
                                goto out_check;
+
+                       /*
+                        * The following checks can be expensive, as they need to
+                        * take other locks and do btree or rbtree searches, so
+                        * release the path to avoid blocking other tasks for too
+                        * long.
+                        */
+                       btrfs_release_path(path);
+
                        /* If extent is RO, we must COW it */
                        if (btrfs_extent_readonly(fs_info, disk_bytenr))
                                goto out_check;
@@ -1680,12 +1732,12 @@ out_check:
                        cur_offset = extent_end;
                        if (cur_offset > end)
                                break;
+                       if (!path->nodes[0])
+                               continue;
                        path->slots[0]++;
                        goto next_slot;
                }
 
-               btrfs_release_path(path);
-
                /*
                 * COW range from cow_start to found_key.offset - 1. As the key
                 * will contain the beginning of the first extent that can be
@@ -2105,6 +2157,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
                spin_lock(&inode->lock);
                ASSERT(inode->new_delalloc_bytes >= len);
                inode->new_delalloc_bytes -= len;
+               if (*bits & EXTENT_ADD_INODE_BYTES)
+                       inode_add_bytes(&inode->vfs_inode, len);
                spin_unlock(&inode->lock);
        }
 }
@@ -2128,7 +2182,7 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
 {
        struct inode *inode = page->mapping->host;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+       u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
@@ -2157,11 +2211,9 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
  * At IO completion time the csums attached to the ordered extent record
  * are inserted into the btree
  */
-static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
-                                   u64 bio_offset)
+static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+                                          u64 dio_file_offset)
 {
-       struct inode *inode = private_data;
-
        return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
 }
 
@@ -2183,9 +2235,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
  *
  *    c-3) otherwise:                  async submit
  */
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
-                                         int mirror_num,
-                                         unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+                                  int mirror_num, unsigned long bio_flags)
 
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2195,7 +2246,8 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
        int skip_sum;
        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
-       skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+       skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+                  !fs_info->csum_root;
 
        if (btrfs_is_free_space_inode(BTRFS_I(inode)))
                metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -2210,8 +2262,13 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
                                                           mirror_num,
                                                           bio_flags);
                        goto out;
-               } else if (!skip_sum) {
-                       ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
+               } else {
+                       /*
+                        * Lookup bio sums does extra checks around whether we
+                        * need to csum or not, which is why we ignore skip_sum
+                        * here.
+                        */
+                       ret = btrfs_lookup_bio_sums(inode, bio, NULL);
                        if (ret)
                                goto out;
                }
@@ -2221,8 +2278,8 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
                        goto mapit;
                /* we're doing a write, do the async checksumming */
-               ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
-                                         0, inode, btrfs_submit_bio_start);
+               ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
+                                         0, btrfs_submit_bio_start);
                goto out;
        } else if (!skip_sum) {
                ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
@@ -2245,16 +2302,15 @@ out:
  * given a list of ordered sums, record them in the inode.  This happens
  * at IO completion time based on sums calculated at bio submission time.
  */
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
-                            struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+                            struct list_head *list)
 {
        struct btrfs_ordered_sum *sum;
        int ret;
 
        list_for_each_entry(sum, list, list) {
                trans->adding_csums = true;
-               ret = btrfs_csum_file_blocks(trans,
-                      BTRFS_I(inode)->root->fs_info->csum_root, sum);
+               ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
                trans->adding_csums = false;
                if (ret)
                        return ret;
@@ -2262,11 +2318,69 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+                                        const u64 start,
+                                        const u64 len,
+                                        struct extent_state **cached_state)
+{
+       u64 search_start = start;
+       const u64 end = start + len - 1;
+
+       while (search_start < end) {
+               const u64 search_len = end - search_start + 1;
+               struct extent_map *em;
+               u64 em_len;
+               int ret = 0;
+
+               em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+               if (IS_ERR(em))
+                       return PTR_ERR(em);
+
+               if (em->block_start != EXTENT_MAP_HOLE)
+                       goto next;
+
+               em_len = em->len;
+               if (em->start < search_start)
+                       em_len -= search_start - em->start;
+               if (em_len > search_len)
+                       em_len = search_len;
+
+               ret = set_extent_bit(&inode->io_tree, search_start,
+                                    search_start + em_len - 1,
+                                    EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
+                                    GFP_NOFS, NULL);
+next:
+               search_start = extent_map_end(em);
+               free_extent_map(em);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
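/*
 * Illustration only, not part of this patch: the clamping done above means
 * only the part of a hole extent map that overlaps the search window gets
 * the EXTENT_DELALLOC_NEW bit. For example, a hole starting at 4 KiB with a
 * 1 MiB length and a search window of [8 KiB, 16 KiB) yields an 8 KiB
 * overlap.
 */
static u64 delalloc_new_overlap(u64 em_start, u64 em_len,
                                u64 search_start, u64 search_len)
{
        if (em_start < search_start)
                em_len -= search_start - em_start;      /* clip the front */
        if (em_len > search_len)
                em_len = search_len;                    /* clip the tail */
        return em_len;
}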
+
 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
                              struct extent_state **cached_state)
 {
        WARN_ON(PAGE_ALIGNED(end));
+
+       if (start >= i_size_read(&inode->vfs_inode) &&
+           !(inode->flags & BTRFS_INODE_PREALLOC)) {
+               /*
+                * There can't be any extents following eof in this case so just
+                * set the delalloc new bit for the range directly.
+                */
+               extra_bits |= EXTENT_DELALLOC_NEW;
+       } else {
+               int ret;
+
+               ret = btrfs_find_new_delalloc_bytes(inode, start,
+                                                   end + 1 - start,
+                                                   cached_state);
+               if (ret)
+                       return ret;
+       }
+
        return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
                                   cached_state);
 }
@@ -2357,7 +2471,7 @@ again:
                unlock_extent_cached(&inode->io_tree, page_start, page_end,
                                     &cached_state);
                unlock_page(page);
-               btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+               btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }
@@ -2462,9 +2576,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                       struct btrfs_inode *inode, u64 file_pos,
                                       struct btrfs_file_extent_item *stack_fi,
+                                      const bool update_inode_bytes,
                                       u64 qgroup_reserved)
 {
        struct btrfs_root *root = inode->root;
+       const u64 sectorsize = root->fs_info->sectorsize;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key ins;
@@ -2472,7 +2588,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
        u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
        u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
-       int extent_inserted = 0;
+       struct btrfs_drop_extents_args drop_args = { 0 };
        int ret;
 
        path = btrfs_alloc_path();
@@ -2488,18 +2604,20 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
-       ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
-                                  file_pos + num_bytes, NULL, 0,
-                                  1, sizeof(*stack_fi), &extent_inserted);
+       drop_args.path = path;
+       drop_args.start = file_pos;
+       drop_args.end = file_pos + num_bytes;
+       drop_args.replace_extent = true;
+       drop_args.extent_item_size = sizeof(*stack_fi);
+       ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret)
                goto out;
 
-       if (!extent_inserted) {
+       if (!drop_args.extent_inserted) {
                ins.objectid = btrfs_ino(inode);
                ins.offset = file_pos;
                ins.type = BTRFS_EXTENT_DATA_KEY;
 
-               path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &ins,
                                              sizeof(*stack_fi));
                if (ret)
@@ -2514,7 +2632,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 
-       inode_add_bytes(&inode->vfs_inode, num_bytes);
+       /*
+        * If we dropped an inline extent here, we know that the range it
+        * covered was not marked with the EXTENT_DELALLOC_NEW bit, so we
+        * update the number of bytes only for the range containing the
+        * inline extent. The remainder of the range will be processed when
+        * clearing the EXTENT_DELALLOC_NEW bit through the ordered extent
+        * completion.
+        */
+       if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
+               u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
+
+               inline_size = drop_args.bytes_found - inline_size;
+               btrfs_update_inode_bytes(inode, sectorsize, inline_size);
+               drop_args.bytes_found -= inline_size;
+               num_bytes -= sectorsize;
+       }
+
+       if (update_inode_bytes)
+               btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
 
        ins.objectid = disk_bytenr;
        ins.offset = disk_num_bytes;
@@ -2548,11 +2683,11 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
 }
 
 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
-                                            struct inode *inode,
                                             struct btrfs_ordered_extent *oe)
 {
        struct btrfs_file_extent_item stack_fi;
        u64 logical_len;
+       bool update_inode_bytes;
 
        memset(&stack_fi, 0, sizeof(stack_fi));
        btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
@@ -2568,8 +2703,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
        /* Encryption and other encoding is reserved and all 0 */
 
-       return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
-                                          &stack_fi, oe->qgroup_rsv);
+       /*
+        * For delalloc, when completing an ordered extent we update the inode's
+        * bytes when clearing the range in the inode's io tree, so pass false
+        * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
+        * except if the ordered extent was truncated.
+        */
+       update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+                            test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
+
+       return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+                                          oe->file_offset, &stack_fi,
+                                          update_inode_bytes, oe->qgroup_rsv);
 }
 
 /*
@@ -2579,11 +2724,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
  */
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
-       struct inode *inode = ordered_extent->inode;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans = NULL;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_state *cached_state = NULL;
        u64 start, end;
        int compress_type = 0;
@@ -2591,10 +2736,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        u64 logical_len = ordered_extent->num_bytes;
        bool freespace_inode;
        bool truncated = false;
-       bool range_locked = false;
-       bool clear_new_delalloc_bytes = false;
        bool clear_reserved_extent = true;
-       unsigned int clear_bits;
+       unsigned int clear_bits = EXTENT_DEFRAG;
 
        start = ordered_extent->file_offset;
        end = start + ordered_extent->num_bytes - 1;
@@ -2602,16 +2745,16 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
            !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
            !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
-               clear_new_delalloc_bytes = true;
+               clear_bits |= EXTENT_DELALLOC_NEW;
 
-       freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
+       freespace_inode = btrfs_is_free_space_inode(inode);
 
        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
                ret = -EIO;
                goto out;
        }
 
-       btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
+       btrfs_free_io_failure_record(inode, start, end);
 
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
                truncated = true;
@@ -2634,14 +2777,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                        trans = NULL;
                        goto out;
                }
-               trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+               trans->block_rsv = &inode->block_rsv;
                ret = btrfs_update_inode_fallback(trans, root, inode);
                if (ret) /* -ENOMEM or corruption */
                        btrfs_abort_transaction(trans, ret);
                goto out;
        }
 
-       range_locked = true;
+       clear_bits |= EXTENT_LOCKED;
        lock_extent_bits(io_tree, start, end, &cached_state);
 
        if (freespace_inode)
@@ -2654,20 +2797,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
-       trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+       trans->block_rsv = &inode->block_rsv;
 
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                BUG_ON(compress_type);
-               ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
+               ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                logical_len);
        } else {
                BUG_ON(root == fs_info->tree_root);
-               ret = insert_ordered_extent_file_extent(trans, inode,
-                                                       ordered_extent);
+               ret = insert_ordered_extent_file_extent(trans, ordered_extent);
                if (!ret) {
                        clear_reserved_extent = false;
                        btrfs_release_delalloc_bytes(fs_info,
@@ -2675,20 +2817,30 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                                ordered_extent->disk_num_bytes);
                }
        }
-       unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                          ordered_extent->file_offset,
+       unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
                           ordered_extent->num_bytes, trans->transid);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
 
-       ret = add_pending_csums(trans, inode, &ordered_extent->list);
+       ret = add_pending_csums(trans, &ordered_extent->list);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
 
+       /*
+        * If this is a new delalloc range, clear its new delalloc flag to
+        * update the inode's number of bytes. This needs to be done before
+        * updating the inode item.
+        */
+       if ((clear_bits & EXTENT_DELALLOC_NEW) &&
+           !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
+               clear_extent_bit(&inode->io_tree, start, end,
+                                EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+                                0, 0, &cached_state);
+
        btrfs_inode_safe_disk_i_size_write(inode, 0);
        ret = btrfs_update_inode_fallback(trans, root, inode);
        if (ret) { /* -ENOMEM or corruption */
@@ -2697,12 +2849,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        }
        ret = 0;
 out:
-       clear_bits = EXTENT_DEFRAG;
-       if (range_locked)
-               clear_bits |= EXTENT_LOCKED;
-       if (clear_new_delalloc_bytes)
-               clear_bits |= EXTENT_DELALLOC_NEW;
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
+       clear_extent_bit(&inode->io_tree, start, end, clear_bits,
                         (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
                         &cached_state);
 
@@ -2717,7 +2864,7 @@ out:
                clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 
                /* Drop the cache for the part of the extent we didn't write. */
-               btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
+               btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
 
                /*
                 * If the ordered extent had an IOERR or something else went
@@ -2772,8 +2919,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
                                          u64 end, int uptodate)
 {
-       struct inode *inode = page->mapping->host;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct btrfs_workqueue *wq;
 
@@ -2784,7 +2931,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
                                            end - start + 1, uptodate))
                return;
 
-       if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+       if (btrfs_is_free_space_inode(inode))
                wq = fs_info->endio_freespace_worker;
        else
                wq = fs_info->endio_write_workers;
@@ -2793,18 +2940,32 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
        btrfs_queue_work(wq, &ordered_extent->work);
 }
 
+/*
+ * check_data_csum - verify checksum of one sector of uncompressed data
+ * @inode:     inode
+ * @io_bio:    btrfs_io_bio which contains the csum
+ * @bio_offset:        offset of the sector from the beginning of the bio (in bytes)
+ * @page:      page containing the data to be verified
+ * @pgoff:     offset inside the page
+ *
+ * The check always covers exactly one sector.
+ */
 static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
-                          int icsum, struct page *page, int pgoff, u64 start,
-                          size_t len)
+                          u32 bio_offset, struct page *page, u32 pgoff)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        char *kaddr;
-       u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+       u32 len = fs_info->sectorsize;
+       const u32 csum_size = fs_info->csum_size;
+       unsigned int offset_sectors;
        u8 *csum_expected;
        u8 csum[BTRFS_CSUM_SIZE];
 
-       csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
+       ASSERT(pgoff + len <= PAGE_SIZE);
+
+       offset_sectors = bio_offset >> fs_info->sectorsize_bits;
+       csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
 
        kaddr = kmap_atomic(page);
        shash->tfm = fs_info->csum_shash;
@@ -2817,8 +2978,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
        kunmap_atomic(kaddr);
        return 0;
 zeroit:
-       btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
-                                   io_bio->mirror_num);
+       btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
+                                   csum, csum_expected, io_bio->mirror_num);
        if (io_bio->device)
                btrfs_dev_stat_inc_and_print(io_bio->device,
                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -2829,18 +2990,23 @@ zeroit:
 }
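/*
 * Illustration only, not part of this patch: how the expected checksum is
 * located, assuming 4 KiB sectors (sectorsize_bits == 12) and 4-byte crc32c
 * checksums. The sector that starts 8192 bytes into the bio uses bytes
 * 8..11 of io_bio->csum.
 */
static const u8 *expected_csum(const u8 *csums, u32 bio_offset,
                               u32 sectorsize_bits, u32 csum_size)
{
        u32 sector_index = bio_offset >> sectorsize_bits;       /* 8192 >> 12 == 2 */

        return csums + sector_index * csum_size;                /* csums + 8 */
}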
 
 /*
- * when reads are done, we need to check csums to verify the data is correct
+ * When reads are done, we need to check csums to verify the data is correct.
  * If there's a match, we allow the bio to finish.  If not, the code in
  * extent_io.c will try to find good copies for us.
+ *
+ * @bio_offset:        offset of the range start from the beginning of the bio (in bytes)
+ * @start:     file offset of the range start
+ * @end:       file offset of the range end (inclusive)
+ * @mirror:    mirror number
  */
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
-                                     u64 phy_offset, struct page *page,
-                                     u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+                          struct page *page, u64 start, u64 end, int mirror)
 {
-       size_t offset = start - page_offset(page);
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       const u32 sectorsize = root->fs_info->sectorsize;
+       u32 pg_off;
 
        if (PageChecked(page)) {
                ClearPageChecked(page);
@@ -2850,15 +3016,27 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                return 0;
 
+       if (!root->fs_info->csum_root)
+               return 0;
+
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
                return 0;
        }
 
-       phy_offset >>= inode->i_sb->s_blocksize_bits;
-       return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
-                              (size_t)(end - start + 1));
+       ASSERT(page_offset(page) <= start &&
+              end <= page_offset(page) + PAGE_SIZE - 1);
+       for (pg_off = offset_in_page(start);
+            pg_off < offset_in_page(end);
+            pg_off += sectorsize, bio_offset += sectorsize) {
+               int ret;
+
+               ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+               if (ret < 0)
+                       return -EIO;
+       }
+       return 0;
 }
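/*
 * Worked example of the per-sector loop above (illustration only, assuming
 * a 4 KiB sector size and a sector-aligned range): for [start = 0,
 * end = 8191] on a page at file offset 0, pg_off takes the values 0 and
 * 4096, so two sectors are verified and bio_offset advances by 4096 each
 * time.
 */
static u32 sectors_to_verify(u64 start, u64 end, u32 sectorsize)
{
        u32 pg_off;
        u32 count = 0;

        for (pg_off = offset_in_page(start);
             pg_off < offset_in_page(end);
             pg_off += sectorsize)
                count++;                /* [0, 8191] with 4 KiB sectors -> 2 */
        return count;
}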
 
 /*
@@ -3055,7 +3233,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
                if (ret == -ENOENT && root == fs_info->tree_root) {
                        struct btrfs_root *dead_root;
-                       struct btrfs_fs_info *fs_info = root->fs_info;
                        int is_dead_root = 0;
 
                        /*
@@ -3395,7 +3572,6 @@ cache_acl:
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &btrfs_aops;
-               BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
                inode->i_fop = &btrfs_file_operations;
                inode->i_op = &btrfs_file_inode_operations;
                break;
@@ -3470,7 +3646,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
  * copy everything in the in-memory inode into the btree.
  */
 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root, struct inode *inode)
+                               struct btrfs_root *root,
+                               struct btrfs_inode *inode)
 {
        struct btrfs_inode_item *inode_item;
        struct btrfs_path *path;
@@ -3481,9 +3658,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
-       path->leave_spinning = 1;
-       ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
-                                1);
+       ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
@@ -3494,9 +3669,9 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
 
-       fill_inode_item(trans, leaf, inode_item, inode);
+       fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
        btrfs_mark_buffer_dirty(leaf);
-       btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+       btrfs_set_inode_last_trans(trans, inode);
        ret = 0;
 failed:
        btrfs_free_path(path);
@@ -3507,7 +3682,8 @@ failed:
  * copy everything in the in-memory inode into the btree.
  */
 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root, struct inode *inode)
+                               struct btrfs_root *root,
+                               struct btrfs_inode *inode)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
@@ -3519,23 +3695,22 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
         * The data relocation inode should also be directly updated
         * without delay
         */
-       if (!btrfs_is_free_space_inode(BTRFS_I(inode))
+       if (!btrfs_is_free_space_inode(inode)
            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
            && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
                btrfs_update_root_times(trans, root);
 
                ret = btrfs_delayed_update_inode(trans, root, inode);
                if (!ret)
-                       btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+                       btrfs_set_inode_last_trans(trans, inode);
                return ret;
        }
 
        return btrfs_update_inode_item(trans, root, inode);
 }
 
-noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root,
-                                        struct inode *inode)
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root, struct btrfs_inode *inode)
 {
        int ret;
 
@@ -3570,7 +3745,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       path->leave_spinning = 1;
        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                    name, name_len, -1);
        if (IS_ERR_OR_NULL(di)) {
@@ -3650,7 +3824,7 @@ err:
        inode_inc_iversion(&dir->vfs_inode);
        inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
                dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
-       ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
+       ret = btrfs_update_inode(trans, root, dir);
 out:
        return ret;
 }
@@ -3664,7 +3838,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
        if (!ret) {
                drop_nlink(&inode->vfs_inode);
-               ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
+               ret = btrfs_update_inode(trans, root, inode);
        }
        return ret;
 }
@@ -3813,7 +3987,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
        inode_inc_iversion(dir);
        dir->i_mtime = dir->i_ctime = current_time(dir);
-       ret = btrfs_update_inode_fallback(trans, root, dir);
+       ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
        if (ret)
                btrfs_abort_transaction(trans, ret);
 out:
@@ -3950,7 +4124,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
        struct btrfs_block_rsv block_rsv;
        u64 root_flags;
        int ret;
-       int err;
 
        /*
         * Don't allow deleting a subvolume with a send in progress. This is
@@ -3972,8 +4145,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
 
        down_write(&fs_info->subvol_sem);
 
-       err = may_destroy_subvol(dest);
-       if (err)
+       ret = may_destroy_subvol(dest);
+       if (ret)
                goto out_up_write;
 
        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
@@ -3982,13 +4155,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
         * two for dir entries,
         * two for root ref/backref.
         */
-       err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
-       if (err)
+       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+       if (ret)
                goto out_up_write;
 
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
-               err = PTR_ERR(trans);
+               ret = PTR_ERR(trans);
                goto out_release;
        }
        trans->block_rsv = &block_rsv;
@@ -3998,7 +4171,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
 
        ret = btrfs_unlink_subvol(trans, dir, dentry);
        if (ret) {
-               err = ret;
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }
@@ -4007,7 +4179,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
 
        memset(&dest->root_item.drop_progress, 0,
                sizeof(dest->root_item.drop_progress));
-       dest->root_item.drop_level = 0;
+       btrfs_set_root_drop_level(&dest->root_item, 0);
        btrfs_set_root_refs(&dest->root_item, 0);
 
        if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
@@ -4016,7 +4188,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
                                        dest->root_key.objectid);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
-                       err = ret;
                        goto out_end_trans;
                }
        }
@@ -4026,7 +4197,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
                                  dest->root_key.objectid);
        if (ret && ret != -ENOENT) {
                btrfs_abort_transaction(trans, ret);
-               err = ret;
                goto out_end_trans;
        }
        if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
@@ -4036,7 +4206,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
                                          dest->root_key.objectid);
                if (ret && ret != -ENOENT) {
                        btrfs_abort_transaction(trans, ret);
-                       err = ret;
                        goto out_end_trans;
                }
        }
@@ -4047,14 +4216,12 @@ out_end_trans:
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        ret = btrfs_end_transaction(trans);
-       if (ret && !err)
-               err = ret;
        inode->i_flags |= S_DEAD;
 out_release:
-       btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+       btrfs_subvolume_release_metadata(root, &block_rsv);
 out_up_write:
        up_write(&fs_info->subvol_sem);
-       if (err) {
+       if (ret) {
                spin_lock(&dest->root_item_lock);
                root_flags = btrfs_root_flags(&dest->root_item);
                btrfs_set_root_flags(&dest->root_item,
@@ -4064,15 +4231,9 @@ out_up_write:
                d_invalidate(dentry);
                btrfs_prune_dentries(dest);
                ASSERT(dest->send_in_progress == 0);
-
-               /* the last ref */
-               if (dest->ino_cache_inode) {
-                       iput(dest->ino_cache_inode);
-                       dest->ino_cache_inode = NULL;
-               }
        }
 
-       return err;
+       return ret;
 }
 
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -4149,7 +4310,7 @@ out:
  */
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
-                              struct inode *inode,
+                              struct btrfs_inode *inode,
                               u64 new_size, u32 min_type)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4170,7 +4331,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int pending_del_slot = 0;
        int extent_type = -1;
        int ret;
-       u64 ino = btrfs_ino(BTRFS_I(inode));
+       u64 ino = btrfs_ino(inode);
        u64 bytes_deleted = 0;
        bool be_nice = false;
        bool should_throttle = false;
@@ -4184,7 +4345,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
         * off from time to time.  This means all inodes in subvolume roots,
         * reloc roots, and data reloc roots.
         */
-       if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
+       if (!btrfs_is_free_space_inode(inode) &&
            test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                be_nice = true;
 
@@ -4194,7 +4355,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        path->reada = READA_BACK;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-               lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+               lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
                                 &cached_state);
 
                /*
@@ -4202,7 +4363,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                 * new size is not block aligned since we will be keeping the
                 * last block of the extent just the way it is.
                 */
-               btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
+               btrfs_drop_extent_cache(inode, ALIGN(new_size,
                                        fs_info->sectorsize),
                                        (u64)-1, 0);
        }
@@ -4213,8 +4374,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
         * it is used to drop the logged items. So we shouldn't kill the delayed
         * items.
         */
-       if (min_type == 0 && root == BTRFS_I(inode)->root)
-               btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+       if (min_type == 0 && root == inode->root)
+               btrfs_kill_delayed_inode_items(inode);
 
        key.objectid = ino;
        key.offset = (u64)-1;
@@ -4270,14 +4431,13 @@ search_again:
                                    btrfs_file_extent_num_bytes(leaf, fi);
 
                                trace_btrfs_truncate_show_fi_regular(
-                                       BTRFS_I(inode), leaf, fi,
-                                       found_key.offset);
+                                       inode, leaf, fi, found_key.offset);
                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                item_end += btrfs_file_extent_ram_bytes(leaf,
                                                                        fi);
 
                                trace_btrfs_truncate_show_fi_inline(
-                                       BTRFS_I(inode), leaf, fi, path->slots[0],
+                                       inode, leaf, fi, path->slots[0],
                                        found_key.offset);
                        }
                        item_end--;
@@ -4316,7 +4476,8 @@ search_again:
                                if (test_bit(BTRFS_ROOT_SHAREABLE,
                                             &root->state) &&
                                    extent_start != 0)
-                                       inode_sub_bytes(inode, num_dec);
+                                       inode_sub_bytes(&inode->vfs_inode,
+                                                       num_dec);
                                btrfs_mark_buffer_dirty(leaf);
                        } else {
                                extent_num_bytes =
@@ -4331,7 +4492,8 @@ search_again:
                                        found_extent = 1;
                                        if (test_bit(BTRFS_ROOT_SHAREABLE,
                                                     &root->state))
-                                               inode_sub_bytes(inode, num_dec);
+                                               inode_sub_bytes(&inode->vfs_inode,
+                                                               num_dec);
                                }
                        }
                        clear_len = num_dec;
@@ -4366,7 +4528,8 @@ search_again:
                        }
 
                        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-                               inode_sub_bytes(inode, item_end + 1 - new_size);
+                               inode_sub_bytes(&inode->vfs_inode,
+                                               item_end + 1 - new_size);
                }
 delete:
                /*
@@ -4374,8 +4537,8 @@ delete:
                 * multiple fsyncs, and in this case we don't want to clear the
                 * file extent range because it's just the log.
                 */
-               if (root == BTRFS_I(inode)->root) {
-                       ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+               if (root == inode->root) {
+                       ret = btrfs_inode_clear_file_extent_range(inode,
                                                  clear_start, clear_len);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
@@ -4484,8 +4647,8 @@ out:
                if (!ret && last_size > new_size)
                        last_size = new_size;
                btrfs_inode_safe_disk_i_size_write(inode, last_size);
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
-                                    (u64)-1, &cached_state);
+               unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
+                                    &cached_state);
        }
 
        btrfs_free_path(path);
@@ -4503,12 +4666,12 @@ out:
  * This will find the block for the "from" offset, COW it, and zero out the
  * part we want to zero.  This is used with truncate and hole punching.
  */
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
-                       int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+                        int front)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct address_space *mapping = inode->i_mapping;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct address_space *mapping = inode->vfs_inode.i_mapping;
+       struct extent_io_tree *io_tree = &inode->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
@@ -4531,30 +4694,29 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
        block_start = round_down(from, blocksize);
        block_end = block_start + blocksize - 1;
 
-       ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
-                                         block_start, blocksize);
+       ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+                                         blocksize);
        if (ret < 0) {
-               if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
-                                          &write_bytes) > 0) {
+               if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
                        /* For nocow case, no need to reserve data space */
                        only_release_metadata = true;
                } else {
                        goto out;
                }
        }
-       ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+       ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
        if (ret < 0) {
                if (!only_release_metadata)
-                       btrfs_free_reserved_data_space(BTRFS_I(inode),
-                                       data_reserved, block_start, blocksize);
+                       btrfs_free_reserved_data_space(inode, data_reserved,
+                                                      block_start, blocksize);
                goto out;
        }
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
-               btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                                            block_start, blocksize, true);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+               btrfs_delalloc_release_space(inode, data_reserved, block_start,
+                                            blocksize, true);
+               btrfs_delalloc_release_extents(inode, blocksize);
                ret = -ENOMEM;
                goto out;
        }
@@ -4577,22 +4739,22 @@ again:
        lock_extent_bits(io_tree, block_start, block_end, &cached_state);
        set_page_extent_mapped(page);
 
-       ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
+       ordered = btrfs_lookup_ordered_extent(inode, block_start);
        if (ordered) {
                unlock_extent_cached(io_tree, block_start, block_end,
                                     &cached_state);
                unlock_page(page);
                put_page(page);
-               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }
 
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
+       clear_extent_bit(&inode->io_tree, block_start, block_end,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
                         0, 0, &cached_state);
 
-       ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
+       ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
                                        &cached_state);
        if (ret) {
                unlock_extent_cached(io_tree, block_start, block_end,
@@ -4618,34 +4780,33 @@ again:
        unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
        if (only_release_metadata)
-               set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
-                               block_end, EXTENT_NORESERVE, NULL, NULL,
-                               GFP_NOFS);
+               set_extent_bit(&inode->io_tree, block_start, block_end,
+                              EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
 
 out_unlock:
        if (ret) {
                if (only_release_metadata)
-                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                       blocksize, true);
+                       btrfs_delalloc_release_metadata(inode, blocksize, true);
                else
-                       btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+                       btrfs_delalloc_release_space(inode, data_reserved,
                                        block_start, blocksize, true);
        }
-       btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+       btrfs_delalloc_release_extents(inode, blocksize);
        unlock_page(page);
        put_page(page);
 out:
        if (only_release_metadata)
-               btrfs_check_nocow_unlock(BTRFS_I(inode));
+               btrfs_check_nocow_unlock(inode);
        extent_changeset_free(data_reserved);
        return ret;
 }
 
-static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
                             u64 offset, u64 len)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
+       struct btrfs_drop_extents_args drop_args = { 0 };
        int ret;
 
        /*
@@ -4653,9 +4814,9 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
         * that any holes get logged if we fsync.
         */
        if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
-               BTRFS_I(inode)->last_trans = fs_info->generation;
-               BTRFS_I(inode)->last_sub_trans = root->log_transid;
-               BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+               inode->last_trans = fs_info->generation;
+               inode->last_sub_trans = root->log_transid;
+               inode->last_log_commit = root->last_log_commit;
                return 0;
        }
 
@@ -4668,19 +4829,25 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+       drop_args.start = offset;
+       drop_args.end = offset + len;
+       drop_args.drop_cache = true;
+
+       ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }
 
-       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
+       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
                        offset, 0, 0, len, 0, len, 0, 0, 0);
-       if (ret)
+       if (ret) {
                btrfs_abort_transaction(trans, ret);
-       else
+       } else {
+               btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
                btrfs_update_inode(trans, root, inode);
+       }
        btrfs_end_transaction(trans);
        return ret;
 }
@@ -4691,14 +4858,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
  * the range between oldsize and size
  */
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
        u64 block_end = ALIGN(size, fs_info->sectorsize);
        u64 last_byte;
@@ -4718,11 +4885,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        if (size <= hole_start)
                return 0;
 
-       btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
-                                          block_end - 1, &cached_state);
+       btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
+                                          &cached_state);
        cur_offset = hole_start;
        while (1) {
-               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+               em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      block_end - cur_offset);
                if (IS_ERR(em)) {
                        err = PTR_ERR(em);
@@ -4741,17 +4908,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                        if (err)
                                break;
 
-                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+                       err = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (err)
                                break;
 
-                       btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
+                       btrfs_drop_extent_cache(inode, cur_offset,
                                                cur_offset + hole_size - 1, 0);
                        hole_em = alloc_extent_map();
                        if (!hole_em) {
                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-                                       &BTRFS_I(inode)->runtime_flags);
+                                       &inode->runtime_flags);
                                goto next;
                        }
                        hole_em->start = cur_offset;
@@ -4771,14 +4938,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                write_unlock(&em_tree->lock);
                                if (err != -EEXIST)
                                        break;
-                               btrfs_drop_extent_cache(BTRFS_I(inode),
-                                                       cur_offset,
+                               btrfs_drop_extent_cache(inode, cur_offset,
                                                        cur_offset +
                                                        hole_size - 1, 0);
                        }
                        free_extent_map(hole_em);
                } else {
-                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+                       err = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (err)
                                break;
@@ -4826,7 +4992,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                 * this truncation.
                 */
                btrfs_drew_write_lock(&root->snapshot_lock);
-               ret = btrfs_cont_expand(inode, oldsize, newsize);
+               ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
                if (ret) {
                        btrfs_drew_write_unlock(&root->snapshot_lock);
                        return ret;
@@ -4839,28 +5005,25 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                }
 
                i_size_write(inode, newsize);
-               btrfs_inode_safe_disk_i_size_write(inode, 0);
+               btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
                pagecache_isize_extended(inode, oldsize, newsize);
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
                btrfs_drew_write_unlock(&root->snapshot_lock);
                btrfs_end_transaction(trans);
        } else {
 
                /*
                 * We're truncating a file that used to have good data down to
-                * zero. Make sure it gets into the ordered flush list so that
-                * any new writes get down to disk quickly.
+                * zero. Make sure any new writes to the file get on disk
+                * on close.
                 */
                if (newsize == 0)
-                       set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                       set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);
 
                truncate_setsize(inode, newsize);
 
-               /* Disable nonlocked read DIO to avoid the endless truncate */
-               btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
                inode_dio_wait(inode);
-               btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
                ret = btrfs_truncate(inode, newsize == oldsize);
                if (ret && inode->i_nlink) {
@@ -5115,7 +5278,8 @@ void btrfs_evict_inode(struct inode *inode)
 
                trans->block_rsv = rsv;
 
-               ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+               ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
+                                                0, 0);
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);
@@ -5142,10 +5306,6 @@ void btrfs_evict_inode(struct inode *inode)
                btrfs_end_transaction(trans);
        }
 
-       if (!(root == fs_info->tree_root ||
-             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
-               btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
-
 free_rsv:
        btrfs_free_block_rsv(fs_info, rsv);
 no_delete:
@@ -5305,15 +5465,15 @@ static void inode_tree_add(struct inode *inode)
        spin_unlock(&root->inode_lock);
 }
 
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_root *root = inode->root;
        int empty = 0;
 
        spin_lock(&root->inode_lock);
-       if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
-               rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
-               RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+       if (!RB_EMPTY_NODE(&inode->rb_node)) {
+               rb_erase(&inode->rb_node, &root->inode_tree);
+               RB_CLEAR_NODE(&inode->rb_node);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
        }
        spin_unlock(&root->inode_lock);
@@ -5755,7 +5915,7 @@ static int btrfs_dirty_inode(struct inode *inode)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       ret = btrfs_update_inode(trans, root, inode);
+       ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
        if (ret && ret == -ENOSPC) {
                /* whoops, lets try again with the full transaction */
                btrfs_end_transaction(trans);
@@ -5763,7 +5923,7 @@ static int btrfs_dirty_inode(struct inode *inode)
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
 
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
        }
        btrfs_end_transaction(trans);
        if (BTRFS_I(inode)->delayed_node)
@@ -6026,7 +6186,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                goto fail;
        }
 
-       path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
        if (ret != 0)
                goto fail_unlock;
@@ -6152,7 +6311,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                parent_inode->vfs_inode.i_mtime = now;
                parent_inode->vfs_inode.i_ctime = now;
        }
-       ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
+       ret = btrfs_update_inode(trans, root, parent_inode);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
@@ -6212,7 +6371,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       err = btrfs_find_free_ino(root, &objectid);
+       err = btrfs_find_free_objectid(root, &objectid);
        if (err)
                goto out_unlock;
 
@@ -6243,7 +6402,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (err)
                goto out_unlock;
 
-       btrfs_update_inode(trans, root, inode);
+       btrfs_update_inode(trans, root, BTRFS_I(inode));
        d_instantiate_new(dentry, inode);
 
 out_unlock:
@@ -6276,7 +6435,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       err = btrfs_find_free_ino(root, &objectid);
+       err = btrfs_find_free_objectid(root, &objectid);
        if (err)
                goto out_unlock;
 
@@ -6302,7 +6461,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (err)
                goto out_unlock;
 
-       err = btrfs_update_inode(trans, root, inode);
+       err = btrfs_update_inode(trans, root, BTRFS_I(inode));
        if (err)
                goto out_unlock;
 
@@ -6311,7 +6470,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (err)
                goto out_unlock;
 
-       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        d_instantiate_new(dentry, inode);
 
 out_unlock:
@@ -6374,9 +6532,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                drop_inode = 1;
        } else {
                struct dentry *parent = dentry->d_parent;
-               int ret;
 
-               err = btrfs_update_inode(trans, root, inode);
+               err = btrfs_update_inode(trans, root, BTRFS_I(inode));
                if (err)
                        goto fail;
                if (inode->i_nlink == 1) {
@@ -6389,12 +6546,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                                goto fail;
                }
                d_instantiate(dentry, inode);
-               ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
-                                        true, NULL);
-               if (ret == BTRFS_NEED_TRANS_COMMIT) {
-                       err = btrfs_commit_transaction(trans);
-                       trans = NULL;
-               }
+               btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
        }
 
 fail:
@@ -6427,7 +6579,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       err = btrfs_find_free_ino(root, &objectid);
+       err = btrfs_find_free_objectid(root, &objectid);
        if (err)
                goto out_fail;
 
@@ -6449,7 +6601,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
                goto out_fail;
 
        btrfs_i_size_write(BTRFS_I(inode), 0);
-       err = btrfs_update_inode(trans, root, inode);
+       err = btrfs_update_inode(trans, root, BTRFS_I(inode));
        if (err)
                goto out_fail;
 
@@ -6540,8 +6692,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
                                    u64 start, u64 len)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       int ret;
-       int err = 0;
+       int ret = 0;
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
@@ -6569,7 +6720,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        }
        em = alloc_extent_map();
        if (!em) {
-               err = -ENOMEM;
+               ret = -ENOMEM;
                goto out;
        }
        em->start = EXTENT_MAP_HOLE;
@@ -6579,7 +6730,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 
        path = btrfs_alloc_path();
        if (!path) {
-               err = -ENOMEM;
+               ret = -ENOMEM;
                goto out;
        }
 
@@ -6587,19 +6738,23 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        path->reada = READA_FORWARD;
 
        /*
-        * Unless we're going to uncompress the inline extent, no sleep would
-        * happen.
+        * The same explanation as in load_free_space_cache applies here as
+        * well: we only read when we're loading the free space cache, and at
+        * that point the commit_root has everything we need.
         */
-       path->leave_spinning = 1;
+       if (btrfs_is_free_space_inode(inode)) {
+               path->search_commit_root = 1;
+               path->skip_locking = 1;
+       }
 
        ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
        if (ret < 0) {
-               err = ret;
                goto out;
        } else if (ret > 0) {
                if (path->slots[0] == 0)
                        goto not_found;
                path->slots[0]--;
+               ret = 0;
        }
 
        leaf = path->nodes[0];
@@ -6625,7 +6780,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                /* Only regular file could have regular/prealloc extent */
                if (!S_ISREG(inode->vfs_inode.i_mode)) {
-                       err = -EUCLEAN;
+                       ret = -EUCLEAN;
                        btrfs_crit(fs_info,
                "regular/prealloc extent found for non-regular inode %llu",
                                   btrfs_ino(inode));
@@ -6643,12 +6798,11 @@ next:
                path->slots[0]++;
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
-                       if (ret < 0) {
-                               err = ret;
+                       if (ret < 0)
                                goto out;
-                       } else if (ret > 0) {
+                       else if (ret > 0)
                                goto not_found;
-                       }
+
                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6693,16 +6847,13 @@ next:
                em->orig_start = em->start;
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 
-               btrfs_set_path_blocking(path);
                if (!PageUptodate(page)) {
                        if (btrfs_file_extent_compression(leaf, item) !=
                            BTRFS_COMPRESS_NONE) {
                                ret = uncompress_inline(path, page, pg_offset,
                                                        extent_offset, item);
-                               if (ret) {
-                                       err = ret;
+                               if (ret)
                                        goto out;
-                               }
                        } else {
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6726,29 +6877,28 @@ not_found:
        em->len = len;
        em->block_start = EXTENT_MAP_HOLE;
 insert:
+       ret = 0;
        btrfs_release_path(path);
        if (em->start > start || extent_map_end(em) <= start) {
                btrfs_err(fs_info,
                          "bad extent! em: [%llu %llu] passed [%llu %llu]",
                          em->start, em->len, start, len);
-               err = -EIO;
+               ret = -EIO;
                goto out;
        }
 
-       err = 0;
        write_lock(&em_tree->lock);
-       err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+       ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
        write_unlock(&em_tree->lock);
 out:
        btrfs_free_path(path);
 
        trace_btrfs_get_extent(root, inode, em);
 
-       if (err) {
+       if (ret) {
                free_extent_map(em);
-               return ERR_PTR(err);
+               return ERR_PTR(ret);
        }
-       BUG_ON(!em); /* Error is always set */
        return em;
 }
 
@@ -7111,7 +7261,7 @@ out:
 }
 
 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
-                             struct extent_state **cached_state, int writing)
+                             struct extent_state **cached_state, bool writing)
 {
        struct btrfs_ordered_extent *ordered;
        int ret = 0;
@@ -7160,7 +7310,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                         */
                        if (writing ||
                            test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
-                               btrfs_start_ordered_extent(inode, ordered, 1);
+                               btrfs_start_ordered_extent(ordered, 1);
                        else
                                ret = -ENOTBLK;
                        btrfs_put_ordered_extent(ordered);
@@ -7249,30 +7399,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
 }
 
 
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
-                                       struct buffer_head *bh_result,
-                                       struct inode *inode,
-                                       u64 start, u64 len)
-{
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
-       if (em->block_start == EXTENT_MAP_HOLE ||
-                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-               return -ENOENT;
-
-       len = min(len, em->len - (start - em->start));
-
-       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
-               inode->i_blkbits;
-       bh_result->b_size = len;
-       bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
-       set_buffer_mapped(bh_result);
-
-       return 0;
-}
-
 static int btrfs_get_blocks_direct_write(struct extent_map **map,
-                                        struct buffer_head *bh_result,
                                         struct inode *inode,
                                         struct btrfs_dio_data *dio_data,
                                         u64 start, u64 len)
@@ -7333,7 +7460,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
        }
 
        /* this will cow the extent */
-       len = bh_result->b_size;
        free_extent_map(em);
        *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
        if (IS_ERR(em)) {
@@ -7344,64 +7470,76 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
        len = min(len, em->len - (start - em->start));
 
 skip_cow:
-       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
-               inode->i_blkbits;
-       bh_result->b_size = len;
-       bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
-       set_buffer_mapped(bh_result);
-
-       if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-               set_buffer_new(bh_result);
-
        /*
         * Need to update the i_size under the extent lock so buffered
         * readers will get the updated i_size when we unlock.
         */
-       if (!dio_data->overwrite && start + len > i_size_read(inode))
+       if (start + len > i_size_read(inode))
                i_size_write(inode, start + len);
 
-       WARN_ON(dio_data->reserve < len);
        dio_data->reserve -= len;
-       dio_data->unsubmitted_oe_range_end = start + len;
-       current->journal_info = dio_data;
 out:
        return ret;
 }
 
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+               loff_t length, unsigned int flags, struct iomap *iomap,
+               struct iomap *srcmap)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_map *em;
        struct extent_state *cached_state = NULL;
        struct btrfs_dio_data *dio_data = NULL;
-       u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
-       u64 len = bh_result->b_size;
+       const bool write = !!(flags & IOMAP_WRITE);
        int ret = 0;
+       u64 len = length;
+       bool unlock_extents = false;
 
-       if (!create)
+       if (!write)
                len = min_t(u64, len, fs_info->sectorsize);
 
        lockstart = start;
        lockend = start + len - 1;
 
-       if (current->journal_info) {
-               /*
-                * Need to pull our outstanding extents and set journal_info to NULL so
-                * that anything that needs to check if there's a transaction doesn't get
-                * confused.
-                */
-               dio_data = current->journal_info;
-               current->journal_info = NULL;
+       /*
+        * The generic stuff only does filemap_write_and_wait_range, which
+        * isn't enough if we've written compressed pages to this area, so we
+        * need to flush the dirty pages again to make absolutely sure that any
+        * outstanding dirty pages are on disk.
+        */
+       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                    &BTRFS_I(inode)->runtime_flags)) {
+               ret = filemap_fdatawrite_range(inode->i_mapping, start,
+                                              start + length - 1);
+               if (ret)
+                       return ret;
+       }
+
+       dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+       if (!dio_data)
+               return -ENOMEM;
+
+       dio_data->length = length;
+       if (write) {
+               dio_data->reserve = round_up(length, fs_info->sectorsize);
+               ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+                               &dio_data->data_reserved,
+                               start, dio_data->reserve);
+               if (ret) {
+                       extent_changeset_free(dio_data->data_reserved);
+                       kfree(dio_data);
+                       return ret;
+               }
        }
+       iomap->private = dio_data;
+
 
        /*
         * If this errors out, it's because we couldn't invalidate pagecache
         * for this range and we need to fall back to buffered.
         */
-       if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
-                              create)) {
+       if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
                ret = -ENOTBLK;
                goto err;
        }
@@ -7433,36 +7571,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                goto unlock_err;
        }
 
-       if (create) {
-               ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
-                                                   dio_data, start, len);
+       len = min(len, em->len - (start - em->start));
+       if (write) {
+               ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+                                                   start, len);
                if (ret < 0)
                        goto unlock_err;
-
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, &cached_state);
+               unlock_extents = true;
+               /* Recalc len in case the new em is smaller than requested */
+               len = min(len, em->len - (start - em->start));
        } else {
-               ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
-                                                  start, len);
-               /* Can be negative only if we read from a hole */
-               if (ret < 0) {
-                       ret = 0;
-                       free_extent_map(em);
-                       goto unlock_err;
-               }
                /*
                 * We need to unlock only the end area that we aren't using.
                 * The rest is going to be unlocked by the endio routine.
                 */
-               lockstart = start + bh_result->b_size;
-               if (lockstart < lockend) {
-                       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                            lockstart, lockend, &cached_state);
-               } else {
-                       free_extent_state(cached_state);
-               }
+               lockstart = start + len;
+               if (lockstart < lockend)
+                       unlock_extents = true;
        }
 
+       if (unlock_extents)
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                    lockstart, lockend, &cached_state);
+       else
+               free_extent_state(cached_state);
+
+       /*
+        * Translate extent map information to iomap.
+        * We trim the extents (and move the addr) even though the iomap code
+        * does that, since we have locked only the parts we are performing
+        * I/O in.
+        */
+       if ((em->block_start == EXTENT_MAP_HOLE) ||
+           (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+               iomap->addr = IOMAP_NULL_ADDR;
+               iomap->type = IOMAP_HOLE;
+       } else {
+               iomap->addr = em->block_start + (start - em->start);
+               iomap->type = IOMAP_MAPPED;
+       }
+       iomap->offset = start;
+       iomap->bdev = fs_info->fs_devices->latest_bdev;
+       iomap->length = len;
+
        free_extent_map(em);
 
        return 0;
@@ -7471,8 +7621,55 @@ unlock_err:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state);
 err:
-       if (dio_data)
-               current->journal_info = dio_data;
+       if (dio_data) {
+               btrfs_delalloc_release_space(BTRFS_I(inode),
+                               dio_data->data_reserved, start,
+                               dio_data->reserve, true);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+               extent_changeset_free(dio_data->data_reserved);
+               kfree(dio_data);
+       }
+       return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+               ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+       int ret = 0;
+       struct btrfs_dio_data *dio_data = iomap->private;
+       size_t submitted = dio_data->submitted;
+       const bool write = !!(flags & IOMAP_WRITE);
+
+       if (!write && (iomap->type == IOMAP_HOLE)) {
+               /* If reading from a hole, unlock and return */
+               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+               goto out;
+       }
+
+       if (submitted < length) {
+               pos += submitted;
+               length -= submitted;
+               if (write)
+                       __endio_write_update_ordered(BTRFS_I(inode), pos,
+                                       length, false);
+               else
+                       unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+                                     pos + length - 1);
+               ret = -ENOTBLK;
+       }
+
+       if (write) {
+               if (dio_data->reserve)
+                       btrfs_delalloc_release_space(BTRFS_I(inode),
+                                       dio_data->data_reserved, pos,
+                                       dio_data->reserve, true);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+               extent_changeset_free(dio_data->data_reserved);
+       }
+out:
+       kfree(dio_data);
+       iomap->private = NULL;
+
        return ret;
 }
 
@@ -7496,7 +7693,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
                              dip->logical_offset + dip->bytes - 1);
        }
 
-       dio_end_io(dip->dio_bio);
+       bio_endio(dip->dio_bio);
        kfree(dip);
 }
 
@@ -7533,7 +7730,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
        struct bio_vec bvec;
        struct bvec_iter iter;
        u64 start = io_bio->logical;
-       int icsum = 0;
+       u32 bio_offset = 0;
        blk_status_t err = BLK_STS_OK;
 
        __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
@@ -7544,9 +7741,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                for (i = 0; i < nr_sectors; i++) {
                        ASSERT(pgoff < PAGE_SIZE);
                        if (uptodate &&
-                           (!csum || !check_data_csum(inode, io_bio, icsum,
-                                                      bvec.bv_page, pgoff,
-                                                      start, sectorsize))) {
+                           (!csum || !check_data_csum(inode, io_bio,
+                                       bio_offset, bvec.bv_page, pgoff))) {
                                clean_io_failure(fs_info, failure_tree, io_tree,
                                                 start, bvec.bv_page,
                                                 btrfs_ino(BTRFS_I(inode)),
@@ -7554,6 +7750,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                        } else {
                                blk_status_t status;
 
+                               ASSERT((start - io_bio->logical) < UINT_MAX);
                                status = btrfs_submit_read_repair(inode,
                                                        &io_bio->bio,
                                                        start - io_bio->logical,
@@ -7566,7 +7763,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                                        err = status;
                        }
                        start += sectorsize;
-                       icsum++;
+                       ASSERT(bio_offset + sectorsize > bio_offset);
+                       bio_offset += sectorsize;
                        pgoff += sectorsize;
                }
        }
@@ -7616,12 +7814,11 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
        }
 }
 
-static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
-                                   struct bio *bio, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
+                                                    struct bio *bio,
+                                                    u64 dio_file_offset)
 {
-       struct inode *inode = private_data;
-
-       return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
+       return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
 }
 
 static void btrfs_end_dio_bio(struct bio *bio)
@@ -7633,8 +7830,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
                           "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
                           btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
-                          bio->bi_opf,
-                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_opf, bio->bi_iter.bi_sector,
                           bio->bi_iter.bi_size, err);
 
        if (bio_op(bio) == REQ_OP_READ) {
@@ -7671,8 +7867,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
                goto map;
 
        if (write && async_submit) {
-               ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
-                                         file_offset, inode,
+               ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
                                          btrfs_submit_bio_start_direct_io);
                goto err;
        } else if (write) {
@@ -7687,8 +7882,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
                u64 csum_offset;
 
                csum_offset = file_offset - dip->logical_offset;
-               csum_offset >>= inode->i_sb->s_blocksize_bits;
-               csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
+               csum_offset >>= fs_info->sectorsize_bits;
+               csum_offset *= fs_info->csum_size;
                btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
        }
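An illustrative aside on the csum_offset arithmetic above (example values are assumed for the illustration, not taken from this patch): with 4 KiB sectors (fs_info->sectorsize_bits == 12) and crc32c checksums (fs_info->csum_size == 4), a partial bio whose file_offset sits 64 KiB past dip->logical_offset would index its inline checksum array roughly as follows:

        /*
         * Worked example (sketch, assumed values):
         *      csum_offset = 65536 >> 12;      -> 16 sectors into the dip
         *      csum_offset *= 4;               -> byte offset 64
         *      btrfs_io_bio(bio)->csum = dip->csums + 64;
         * i.e. the bio starts at the checksum of the 17th sector covered by
         * this dip.
         */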
 map:
@@ -7713,11 +7908,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
        dip_size = sizeof(*dip);
        if (!write && csum) {
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-               const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
                size_t nblocks;
 
-               nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
-               dip_size += csum_size * nblocks;
+               nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
+               dip_size += fs_info->csum_size * nblocks;
        }
 
        dip = kzalloc(dip_size, GFP_NOFS);
@@ -7727,30 +7921,16 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
        dip->inode = inode;
        dip->logical_offset = file_offset;
        dip->bytes = dio_bio->bi_iter.bi_size;
-       dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+       dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
        dip->dio_bio = dio_bio;
        refcount_set(&dip->refs, 1);
-
-       if (write) {
-               struct btrfs_dio_data *dio_data = current->journal_info;
-
-               /*
-                * Setting range start and end to the same value means that
-                * no cleanup will happen in btrfs_direct_IO
-                */
-               dio_data->unsubmitted_oe_range_end = dip->logical_offset +
-                       dip->bytes;
-               dio_data->unsubmitted_oe_range_start =
-                       dio_data->unsubmitted_oe_range_end;
-       }
        return dip;
 }
 
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
-                               loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+               struct bio *dio_bio, loff_t file_offset)
 {
        const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
-       const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
                             BTRFS_BLOCK_GROUP_RAID56_MASK);
@@ -7764,6 +7944,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
        int ret;
        blk_status_t status;
        struct btrfs_io_geometry geom;
+       struct btrfs_dio_data *dio_data = iomap->private;
 
        dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
        if (!dip) {
@@ -7772,17 +7953,18 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
                                file_offset + dio_bio->bi_iter.bi_size - 1);
                }
                dio_bio->bi_status = BLK_STS_RESOURCE;
-               dio_end_io(dio_bio);
-               return;
+               bio_endio(dio_bio);
+               return BLK_QC_T_NONE;
        }
 
-       if (!write && csum) {
+       if (!write) {
                /*
                 * Load the csums up front to reduce csum tree searches and
                 * contention when submitting bios.
+                *
+                * If we have csums disabled, this will do nothing.
                 */
-               status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
-                                              dip->csums);
+               status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
                if (status != BLK_STS_OK)
                        goto out_err;
        }
@@ -7844,154 +8026,27 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
                        goto out_err;
                }
 
+               dio_data->submitted += clone_len;
                clone_offset += clone_len;
                start_sector += clone_len >> 9;
                file_offset += clone_len;
        } while (submit_len > 0);
-       return;
+       return BLK_QC_T_NONE;
 
 out_err:
        dip->dio_bio->bi_status = status;
        btrfs_dio_private_put(dip);
+       return BLK_QC_T_NONE;
 }
 
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
-                              const struct iov_iter *iter, loff_t offset)
-{
-       int seg;
-       int i;
-       unsigned int blocksize_mask = fs_info->sectorsize - 1;
-       ssize_t retval = -EINVAL;
-
-       if (offset & blocksize_mask)
-               goto out;
-
-       if (iov_iter_alignment(iter) & blocksize_mask)
-               goto out;
-
-       /* If this is a write we don't need to check anymore */
-       if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
-               return 0;
-       /*
-        * Check to make sure we don't have duplicate iov_base's in this
-        * iovec, if so return EINVAL, otherwise we'll get csum errors
-        * when reading back.
-        */
-       for (seg = 0; seg < iter->nr_segs; seg++) {
-               for (i = seg + 1; i < iter->nr_segs; i++) {
-                       if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
-                               goto out;
-               }
-       }
-       retval = 0;
-out:
-       return retval;
-}
-
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_dio_data dio_data = { 0 };
-       struct extent_changeset *data_reserved = NULL;
-       loff_t offset = iocb->ki_pos;
-       size_t count = 0;
-       int flags = 0;
-       bool wakeup = true;
-       bool relock = false;
-       ssize_t ret;
-
-       if (check_direct_IO(fs_info, iter, offset))
-               return 0;
-
-       inode_dio_begin(inode);
-
-       /*
-        * The generic stuff only does filemap_write_and_wait_range, which
-        * isn't enough if we've written compressed pages to this area, so
-        * we need to flush the dirty pages again to make absolutely sure
-        * that any outstanding dirty pages are on disk.
-        */
-       count = iov_iter_count(iter);
-       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                    &BTRFS_I(inode)->runtime_flags))
-               filemap_fdatawrite_range(inode->i_mapping, offset,
-                                        offset + count - 1);
-
-       if (iov_iter_rw(iter) == WRITE) {
-               /*
-                * If the write DIO is beyond the EOF, we need update
-                * the isize, but it is protected by i_mutex. So we can
-                * not unlock the i_mutex at this case.
-                */
-               if (offset + count <= inode->i_size) {
-                       dio_data.overwrite = 1;
-                       inode_unlock(inode);
-                       relock = true;
-               }
-               ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
-                                                  offset, count);
-               if (ret)
-                       goto out;
-
-               /*
-                * We need to know how many extents we reserved so that we can
-                * do the accounting properly if we go over the number we
-                * originally calculated.  Abuse current->journal_info for this.
-                */
-               dio_data.reserve = round_up(count,
-                                           fs_info->sectorsize);
-               dio_data.unsubmitted_oe_range_start = (u64)offset;
-               dio_data.unsubmitted_oe_range_end = (u64)offset;
-               current->journal_info = &dio_data;
-               down_read(&BTRFS_I(inode)->dio_sem);
-       } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
-                                    &BTRFS_I(inode)->runtime_flags)) {
-               inode_dio_end(inode);
-               flags = DIO_LOCKING | DIO_SKIP_HOLES;
-               wakeup = false;
-       }
-
-       ret = __blockdev_direct_IO(iocb, inode,
-                                  fs_info->fs_devices->latest_bdev,
-                                  iter, btrfs_get_blocks_direct, NULL,
-                                  btrfs_submit_direct, flags);
-       if (iov_iter_rw(iter) == WRITE) {
-               up_read(&BTRFS_I(inode)->dio_sem);
-               current->journal_info = NULL;
-               if (ret < 0 && ret != -EIOCBQUEUED) {
-                       if (dio_data.reserve)
-                               btrfs_delalloc_release_space(BTRFS_I(inode),
-                                       data_reserved, offset, dio_data.reserve,
-                                       true);
-                       /*
-                        * On error we might have left some ordered extents
-                        * without submitting corresponding bios for them, so
-                        * cleanup them up to avoid other tasks getting them
-                        * and waiting for them to complete forever.
-                        */
-                       if (dio_data.unsubmitted_oe_range_start <
-                           dio_data.unsubmitted_oe_range_end)
-                               __endio_write_update_ordered(BTRFS_I(inode),
-                                       dio_data.unsubmitted_oe_range_start,
-                                       dio_data.unsubmitted_oe_range_end -
-                                       dio_data.unsubmitted_oe_range_start,
-                                       false);
-               } else if (ret >= 0 && (size_t)ret < count)
-                       btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                                       offset, count - (size_t)ret, true);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), count);
-       }
-out:
-       if (wakeup)
-               inode_dio_end(inode);
-       if (relock)
-               inode_lock(inode);
+const struct iomap_ops btrfs_dio_iomap_ops = {
+       .iomap_begin            = btrfs_dio_iomap_begin,
+       .iomap_end              = btrfs_dio_iomap_end,
+};
 
-       extent_changeset_free(data_reserved);
-       return ret;
-}
+const struct iomap_dio_ops btrfs_dio_ops = {
+       .submit_io              = btrfs_submit_direct,
+};
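For context, a hedged sketch of how these ops are expected to be driven: the actual call sites live in the file read/write paths outside this hunk, and the iomap_dio_rw() call shown below is an editor's assumption about the caller shape, not part of the patch.

        /* illustrative caller shape only -- not part of this patch */
        ssize_t ret;

        ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
                           is_sync_kiocb(iocb));

iomap_dio_rw() invokes ->iomap_begin() to map and reserve each file range, ->submit_io() (btrfs_submit_direct()) to submit the cloned bios, and ->iomap_end() to release any unused reservation, replacing the current->journal_info bookkeeping deleted above.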
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        u64 start, u64 len)
@@ -8002,12 +8057,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret)
                return ret;
 
-       return extent_fiemap(inode, fieinfo, start, len);
+       return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
 {
-       return extent_read_full_page(page, btrfs_get_extent, 0);
+       struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+       u64 start = page_offset(page);
+       u64 end = start + PAGE_SIZE - 1;
+       unsigned long bio_flags = 0;
+       struct bio *bio = NULL;
+       int ret;
+
+       btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+       ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+       if (bio)
+               ret = submit_one_bio(bio, 0, bio_flags);
+       return ret;
 }
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8091,15 +8158,17 @@ static int btrfs_migratepage(struct address_space *mapping,
 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                                 unsigned int length)
 {
-       struct inode *inode = page->mapping->host;
-       struct extent_io_tree *tree;
+       struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+       struct extent_io_tree *tree = &inode->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_SIZE - 1;
        u64 start;
        u64 end;
-       int inode_evicting = inode->i_state & I_FREEING;
+       int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+       bool found_ordered = false;
+       bool completed_ordered = false;
 
        /*
         * we have the page locked, so new writeback can't start,
@@ -8110,7 +8179,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
         */
        wait_on_page_writeback(page);
 
-       tree = &BTRFS_I(inode)->io_tree;
        if (offset) {
                btrfs_releasepage(page, GFP_NOFS);
                return;
@@ -8120,18 +8188,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                lock_extent_bits(tree, page_start, page_end, &cached_state);
 again:
        start = page_start;
-       ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
-                                       page_end - start + 1);
+       ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
        if (ordered) {
+               found_ordered = true;
                end = min(page_end,
                          ordered->file_offset + ordered->num_bytes - 1);
                /*
-                * IO on this page will never be started, so we need
-                * to account for any ordered extents now
+                * IO on this page will never be started, so we need to account
+                * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
+                * here; we must leave that for the ordered extent completion.
                 */
                if (!inode_evicting)
                        clear_extent_bit(tree, start, end,
-                                        EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                                        EXTENT_DELALLOC |
                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                         EXTENT_DEFRAG, 1, 0, &cached_state);
                /*
@@ -8142,7 +8211,7 @@ again:
                        struct btrfs_ordered_inode_tree *tree;
                        u64 new_len;
 
-                       tree = &BTRFS_I(inode)->ordered_tree;
+                       tree = &inode->ordered_tree;
 
                        spin_lock_irq(&tree->lock);
                        set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8153,8 +8222,10 @@ again:
 
                        if (btrfs_dec_test_ordered_pending(inode, &ordered,
                                                           start,
-                                                          end - start + 1, 1))
+                                                          end - start + 1, 1)) {
                                btrfs_finish_ordered_io(ordered);
+                               completed_ordered = true;
+                       }
                }
                btrfs_put_ordered_extent(ordered);
                if (!inode_evicting) {
@@ -8181,12 +8252,25 @@ again:
         *    bit of its io_tree, and free the qgroup reserved data space.
         *    Since the IO will never happen for this page.
         */
-       btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+       btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
        if (!inode_evicting) {
+               bool delete = true;
+
+               /*
+                * If there's an ordered extent for this range and we have not
+                * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
+                * in the range for the ordered extent completion. We must also
+                * not delete the range, otherwise we would lose that bit (and
+                * any other bits set in the range). Make sure EXTENT_UPTODATE
+                * is cleared if we don't delete, otherwise it can lead to
+                * corruptions if the i_size is extented later.
+                */
+               if (found_ordered && !completed_ordered)
+                       delete = false;
                clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
-                                EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
-                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
-                                &cached_state);
+                                EXTENT_DELALLOC | EXTENT_UPTODATE |
+                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+                                delete, &cached_state);
 
                __btrfs_releasepage(page, GFP_NOFS);
        }
@@ -8283,7 +8367,7 @@ again:
                unlock_extent_cached(io_tree, page_start, page_end,
                                     &cached_state);
                unlock_page(page);
-               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }
@@ -8435,14 +8519,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
        trans->block_rsv = rsv;
 
        while (1) {
-               ret = btrfs_truncate_inode_items(trans, root, inode,
+               ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
                trans->block_rsv = &fs_info->trans_block_rsv;
                if (ret != -ENOSPC && ret != -EAGAIN)
                        break;
 
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
                if (ret)
                        break;
 
@@ -8473,7 +8557,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);
 
-               ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+               ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
                if (ret)
                        goto out;
                trans = btrfs_start_transaction(root, 1);
@@ -8481,14 +8565,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
                        ret = PTR_ERR(trans);
                        goto out;
                }
-               btrfs_inode_safe_disk_i_size_write(inode, 0);
+               btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
        }
 
        if (trans) {
                int ret2;
 
                trans->block_rsv = &fs_info->trans_block_rsv;
-               ret2 = btrfs_update_inode(trans, root, inode);
+               ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
                if (ret2 && !ret)
                        ret = ret2;
 
@@ -8534,7 +8618,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                          "error inheriting subvolume %llu properties: %d",
                          new_root->root_key.objectid, err);
 
-       err = btrfs_update_inode(trans, new_root, inode);
+       err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
 
        iput(inode);
        return err;
@@ -8596,7 +8680,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->delayed_iput);
        RB_CLEAR_NODE(&ei->rb_node);
-       init_rwsem(&ei->dio_sem);
 
        return inode;
 }
@@ -8614,21 +8697,21 @@ void btrfs_free_inode(struct inode *inode)
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ordered_extent *ordered;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+       struct btrfs_root *root = inode->root;
 
-       WARN_ON(!hlist_empty(&inode->i_dentry));
-       WARN_ON(inode->i_data.nrpages);
-       WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
-       WARN_ON(BTRFS_I(inode)->block_rsv.size);
-       WARN_ON(BTRFS_I(inode)->outstanding_extents);
-       WARN_ON(BTRFS_I(inode)->delalloc_bytes);
-       WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
-       WARN_ON(BTRFS_I(inode)->csum_bytes);
-       WARN_ON(BTRFS_I(inode)->defrag_bytes);
+       WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+       WARN_ON(vfs_inode->i_data.nrpages);
+       WARN_ON(inode->block_rsv.reserved);
+       WARN_ON(inode->block_rsv.size);
+       WARN_ON(inode->outstanding_extents);
+       WARN_ON(inode->delalloc_bytes);
+       WARN_ON(inode->new_delalloc_bytes);
+       WARN_ON(inode->csum_bytes);
+       WARN_ON(inode->defrag_bytes);
 
        /*
         * This can happen where we create an inode, but somebody else also
@@ -8643,7 +8726,7 @@ void btrfs_destroy_inode(struct inode *inode)
                if (!ordered)
                        break;
                else {
-                       btrfs_err(fs_info,
+                       btrfs_err(root->fs_info,
                                  "found ordered extent %llu %llu on inode cleanup",
                                  ordered->file_offset, ordered->num_bytes);
                        btrfs_remove_ordered_extent(inode, ordered);
@@ -8651,11 +8734,11 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
-       btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+       btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
-       btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
-       btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
-       btrfs_put_root(BTRFS_I(inode)->root);
+       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+       btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+       btrfs_put_root(inode->root);
 }
 
 int btrfs_drop_inode(struct inode *inode)
@@ -8736,6 +8819,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int flags)
 {
        u64 delalloc_bytes;
+       u64 inode_bytes;
        struct inode *inode = d_inode(path->dentry);
        u32 blocksize = inode->i_sb->s_blocksize;
        u32 bi_flags = BTRFS_I(inode)->flags;
@@ -8762,8 +8846,9 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
 
        spin_lock(&BTRFS_I(inode)->lock);
        delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
+       inode_bytes = inode_get_bytes(inode);
        spin_unlock(&BTRFS_I(inode)->lock);
-       stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+       stat->blocks = (ALIGN(inode_bytes, blocksize) +
                        ALIGN(delalloc_bytes, blocksize)) >> 9;
        return 0;
 }
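
The stat->blocks computation above rounds both byte counters up to the filesystem block size and then converts the sum to the 512-byte units struct kstat expects. A minimal hedged sketch of that arithmetic, with a hypothetical helper name and made-up numbers that are not part of the patch:

	/* Hypothetical illustration of the stat->blocks arithmetic above. */
	static inline u64 stat_blocks_example(u64 inode_bytes, u64 delalloc_bytes,
					      u32 blocksize)
	{
		/* Round each byte count up to the fs block size, then convert
		 * the total to 512-byte units for struct kstat::blocks. */
		return (ALIGN(inode_bytes, blocksize) +
			ALIGN(delalloc_bytes, blocksize)) >> 9;
	}

	/* e.g. blocksize = 4096, inode_bytes = 6000, delalloc_bytes = 100:
	 * ALIGN(6000, 4096) + ALIGN(100, 4096) = 8192 + 4096 = 12288 -> 24 blocks */

The reason both counters are now snapshotted inside the same spin_lock()/spin_unlock() pair above is that the pair stays mutually consistent for a concurrent stat(2); the arithmetic itself can run outside the lock.
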
@@ -8780,27 +8865,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        struct inode *new_inode = new_dentry->d_inode;
        struct inode *old_inode = old_dentry->d_inode;
        struct timespec64 ctime = current_time(old_inode);
-       struct dentry *parent;
        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
        u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
        u64 old_idx = 0;
        u64 new_idx = 0;
        int ret;
+       int ret2;
        bool root_log_pinned = false;
        bool dest_log_pinned = false;
-       struct btrfs_log_ctx ctx_root;
-       struct btrfs_log_ctx ctx_dest;
-       bool sync_log_root = false;
-       bool sync_log_dest = false;
-       bool commit_transaction = false;
 
        /* we only allow rename subvolume link between subvolumes */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
                return -EXDEV;
 
-       btrfs_init_log_ctx(&ctx_root, old_inode);
-       btrfs_init_log_ctx(&ctx_dest, new_inode);
-
        /* close the race window with snapshot create/destroy ioctl */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
            new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8897,7 +8974,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                           old_dentry->d_name.name,
                                           old_dentry->d_name.len);
                if (!ret)
-                       ret = btrfs_update_inode(trans, root, old_inode);
+                       ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@ -8913,7 +8990,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                           new_dentry->d_name.name,
                                           new_dentry->d_name.len);
                if (!ret)
-                       ret = btrfs_update_inode(trans, dest, new_inode);
+                       ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@ -8942,30 +9019,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                BTRFS_I(new_inode)->dir_index = new_idx;
 
        if (root_log_pinned) {
-               parent = new_dentry->d_parent;
-               ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
-                                        BTRFS_I(old_dir), parent,
-                                        false, &ctx_root);
-               if (ret == BTRFS_NEED_LOG_SYNC)
-                       sync_log_root = true;
-               else if (ret == BTRFS_NEED_TRANS_COMMIT)
-                       commit_transaction = true;
-               ret = 0;
+               btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+                                  new_dentry->d_parent);
                btrfs_end_log_trans(root);
                root_log_pinned = false;
        }
        if (dest_log_pinned) {
-               if (!commit_transaction) {
-                       parent = old_dentry->d_parent;
-                       ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
-                                                BTRFS_I(new_dir), parent,
-                                                false, &ctx_dest);
-                       if (ret == BTRFS_NEED_LOG_SYNC)
-                               sync_log_dest = true;
-                       else if (ret == BTRFS_NEED_TRANS_COMMIT)
-                               commit_transaction = true;
-                       ret = 0;
-               }
+               btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+                                  old_dentry->d_parent);
                btrfs_end_log_trans(dest);
                dest_log_pinned = false;
        }
@@ -8998,46 +9059,13 @@ out_fail:
                        dest_log_pinned = false;
                }
        }
-       if (!ret && sync_log_root && !commit_transaction) {
-               ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
-                                    &ctx_root);
-               if (ret)
-                       commit_transaction = true;
-       }
-       if (!ret && sync_log_dest && !commit_transaction) {
-               ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
-                                    &ctx_dest);
-               if (ret)
-                       commit_transaction = true;
-       }
-       if (commit_transaction) {
-               /*
-                * We may have set commit_transaction when logging the new name
-                * in the destination root, in which case we left the source
-                * root context in the list of log contextes. So make sure we
-                * remove it to avoid invalid memory accesses, since the context
-                * was allocated in our stack frame.
-                */
-               if (sync_log_root) {
-                       mutex_lock(&root->log_mutex);
-                       list_del_init(&ctx_root.list);
-                       mutex_unlock(&root->log_mutex);
-               }
-               ret = btrfs_commit_transaction(trans);
-       } else {
-               int ret2;
-
-               ret2 = btrfs_end_transaction(trans);
-               ret = ret ? ret : ret2;
-       }
+       ret2 = btrfs_end_transaction(trans);
+       ret = ret ? ret : ret2;
 out_notrans:
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
            old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&fs_info->subvol_sem);
 
-       ASSERT(list_empty(&ctx_root.list));
-       ASSERT(list_empty(&ctx_dest.list));
-
        return ret;
 }
 
@@ -9051,7 +9079,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
        u64 objectid;
        u64 index;
 
-       ret = btrfs_find_free_ino(root, &objectid);
+       ret = btrfs_find_free_objectid(root, &objectid);
        if (ret)
                return ret;
 
@@ -9082,7 +9110,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
        if (ret)
                goto out;
 
-       ret = btrfs_update_inode(trans, root, inode);
+       ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 out:
        unlock_new_inode(inode);
        if (ret)
@@ -9105,11 +9133,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode = d_inode(old_dentry);
        u64 index = 0;
        int ret;
+       int ret2;
        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
        bool log_pinned = false;
-       struct btrfs_log_ctx ctx;
-       bool sync_log = false;
-       bool commit_transaction = false;
 
        if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return -EPERM;
@@ -9218,7 +9244,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                        old_dentry->d_name.name,
                                        old_dentry->d_name.len);
                if (!ret)
-                       ret = btrfs_update_inode(trans, root, old_inode);
+                       ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@ -9259,17 +9285,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                BTRFS_I(old_inode)->dir_index = index;
 
        if (log_pinned) {
-               struct dentry *parent = new_dentry->d_parent;
-
-               btrfs_init_log_ctx(&ctx, old_inode);
-               ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
-                                        BTRFS_I(old_dir), parent,
-                                        false, &ctx);
-               if (ret == BTRFS_NEED_LOG_SYNC)
-                       sync_log = true;
-               else if (ret == BTRFS_NEED_TRANS_COMMIT)
-                       commit_transaction = true;
-               ret = 0;
+               btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+                                  new_dentry->d_parent);
                btrfs_end_log_trans(root);
                log_pinned = false;
        }
@@ -9306,23 +9323,8 @@ out_fail:
                btrfs_end_log_trans(root);
                log_pinned = false;
        }
-       if (!ret && sync_log) {
-               ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
-               if (ret)
-                       commit_transaction = true;
-       } else if (sync_log) {
-               mutex_lock(&root->log_mutex);
-               list_del(&ctx.list);
-               mutex_unlock(&root->log_mutex);
-       }
-       if (commit_transaction) {
-               ret = btrfs_commit_transaction(trans);
-       } else {
-               int ret2;
-
-               ret2 = btrfs_end_transaction(trans);
-               ret = ret ? ret : ret2;
-       }
+       ret2 = btrfs_end_transaction(trans);
+       ret = ret ? ret : ret2;
 out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&fs_info->subvol_sem);
@@ -9388,7 +9390,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -9428,9 +9430,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
                list_add_tail(&work->list, &works);
                btrfs_queue_work(root->fs_info->flush_workers,
                                 &work->work);
-               ret++;
-               if (nr != -1 && ret >= nr)
-                       goto out;
+               if (*nr != U64_MAX) {
+                       (*nr)--;
+                       if (*nr == 0)
+                               goto out;
+               }
                cond_resched();
                spin_lock(&root->delalloc_lock);
        }
@@ -9455,18 +9459,15 @@ out:
 int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
-       int ret;
+       u64 nr = U64_MAX;
 
        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                return -EROFS;
 
-       ret = start_delalloc_inodes(root, -1, true);
-       if (ret > 0)
-               ret = 0;
-       return ret;
+       return start_delalloc_inodes(root, &nr, true);
 }
 
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
 {
        struct btrfs_root *root;
        struct list_head splice;
@@ -9489,15 +9490,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
                               &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
 
-               ret = start_delalloc_inodes(root, nr, false);
+               ret = start_delalloc_inodes(root, &nr, false);
                btrfs_put_root(root);
                if (ret < 0)
                        goto out;
-
-               if (nr != -1) {
-                       nr -= ret;
-                       WARN_ON(nr < 0);
-               }
                spin_lock(&fs_info->delalloc_root_lock);
        }
        spin_unlock(&fs_info->delalloc_root_lock);
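
The two hunks above replace the old "return how many inodes were started and subtract in the caller" logic with a remaining budget passed by pointer and decremented in place, where U64_MAX means "no limit". A small self-contained sketch of that pattern, under assumed names (work_item, do_one_item and process_some are illustrative, not btrfs code):

	#include <linux/list.h>
	#include <linux/types.h>

	struct work_item {
		struct list_head link;
	};

	static void do_one_item(struct work_item *item)
	{
		/* stand-in for queueing one delalloc flush work item */
	}

	/*
	 * Process list entries until either the list or the shared budget is
	 * exhausted; *remaining == U64_MAX means "no limit", matching the
	 * convention used above.
	 */
	static int process_some(struct list_head *items, u64 *remaining)
	{
		struct work_item *item;

		list_for_each_entry(item, items, link) {
			do_one_item(item);
			if (*remaining != U64_MAX) {
				(*remaining)--;
				if (*remaining == 0)
					break;
			}
		}
		return 0;
	}

Because the budget now lives in one place, the caller above no longer needs the nr -= ret and WARN_ON bookkeeping that the removed lines performed.
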
@@ -9546,7 +9542,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       err = btrfs_find_free_ino(root, &objectid);
+       err = btrfs_find_free_objectid(root, &objectid);
        if (err)
                goto out_unlock;
 
@@ -9568,7 +9564,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
-       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
@@ -9609,7 +9604,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode_nohighmem(inode);
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(BTRFS_I(inode), name_len);
-       err = btrfs_update_inode(trans, root, inode);
+       err = btrfs_update_inode(trans, root, BTRFS_I(inode));
        /*
         * Last step, add directory indexes for our symlink inode. This is the
         * last step to avoid extra cleanup of these indexes if an error happens
@@ -9633,11 +9628,16 @@ out_unlock:
        return err;
 }
 
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
-                                      struct inode *inode, struct btrfs_key *ins,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+                                      struct btrfs_trans_handle *trans_in,
+                                      struct btrfs_inode *inode,
+                                      struct btrfs_key *ins,
                                       u64 file_offset)
 {
        struct btrfs_file_extent_item stack_fi;
+       struct btrfs_replace_extent_info extent_info;
+       struct btrfs_trans_handle *trans = trans_in;
+       struct btrfs_path *path;
        u64 start = ins->objectid;
        u64 len = ins->offset;
        int ret;
@@ -9652,12 +9652,43 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
        /* Encryption and other encoding is reserved and all 0 */
 
-       ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+       ret = btrfs_qgroup_release_data(inode, file_offset, len);
        if (ret < 0)
-               return ret;
-       return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
-                                          &stack_fi, ret);
+               return ERR_PTR(ret);
+
+       if (trans) {
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 file_offset, &stack_fi,
+                                                 true, ret);
+               if (ret)
+                       return ERR_PTR(ret);
+               return trans;
+       }
+
+       extent_info.disk_offset = start;
+       extent_info.disk_len = len;
+       extent_info.data_offset = 0;
+       extent_info.data_len = len;
+       extent_info.file_offset = file_offset;
+       extent_info.extent_buf = (char *)&stack_fi;
+       extent_info.is_new_extent = true;
+       extent_info.qgroup_reserved = ret;
+       extent_info.insertions = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return ERR_PTR(-ENOMEM);
+
+       ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
+                                    file_offset + len - 1, &extent_info,
+                                    &trans);
+       btrfs_free_path(path);
+       if (ret)
+               return ERR_PTR(ret);
+
+       return trans;
 }
+
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                       u64 start, u64 num_bytes, u64 min_size,
                                       loff_t actual_len, u64 *alloc_hint,
@@ -9680,14 +9711,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        if (trans)
                own_trans = false;
        while (num_bytes > 0) {
-               if (own_trans) {
-                       trans = btrfs_start_transaction(root, 3);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                               break;
-                       }
-               }
-
                cur_bytes = min_t(u64, num_bytes, SZ_256M);
                cur_bytes = max(cur_bytes, min_size);
                /*
@@ -9699,11 +9722,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
                                min_size, 0, *alloc_hint, &ins, 1, 0);
-               if (ret) {
-                       if (own_trans)
-                               btrfs_end_transaction(trans);
+               if (ret)
                        break;
-               }
 
                /*
                 * We've reserved this space, and thus converted it from
@@ -9713,16 +9733,21 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                 * clear_offset by our extent size.
                 */
                clear_offset += ins.offset;
-               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 
                last_alloc = ins.offset;
-               ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
-               if (ret) {
+               trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
+                                                   &ins, cur_offset);
+               /*
+                * Now that we inserted the prealloc extent we can finally
+                * decrement the number of reservations in the block group.
+                * If we did it before, we could race with relocation and have
+                * relocation miss the reserved extent, making it fail later.
+                */
+               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
                        btrfs_free_reserved_extent(fs_info, ins.objectid,
                                                   ins.offset, 0);
-                       btrfs_abort_transaction(trans, ret);
-                       if (own_trans)
-                               btrfs_end_transaction(trans);
                        break;
                }
 
@@ -9773,10 +9798,10 @@ next:
                        else
                                i_size = cur_offset;
                        i_size_write(inode, i_size);
-                       btrfs_inode_safe_disk_i_size_write(inode, 0);
+                       btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
                }
 
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -9785,8 +9810,10 @@ next:
                        break;
                }
 
-               if (own_trans)
+               if (own_trans) {
                        btrfs_end_transaction(trans);
+                       trans = NULL;
+               }
        }
        if (clear_offset < end)
                btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9849,7 +9876,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       ret = btrfs_find_free_ino(root, &objectid);
+       ret = btrfs_find_free_objectid(root, &objectid);
        if (ret)
                goto out;
 
@@ -9865,13 +9892,12 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        inode->i_op = &btrfs_file_inode_operations;
 
        inode->i_mapping->a_ops = &btrfs_aops;
-       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
        if (ret)
                goto out;
 
-       ret = btrfs_update_inode(trans, root, inode);
+       ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
        if (ret)
                goto out;
        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
@@ -10072,14 +10098,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 
        /*
         * Balance or device remove/replace/resize can move stuff around from
-        * under us. The EXCL_OP flag makes sure they aren't running/won't run
-        * concurrently while we are mapping the swap extents, and
-        * fs_info->swapfile_pins prevents them from running while the swap file
-        * is active and moving the extents. Note that this also prevents a
-        * concurrent device add which isn't actually necessary, but it's not
+        * under us. The exclop protection makes sure they aren't running/won't
+        * run concurrently while we are mapping the swap extents, and
+        * fs_info->swapfile_pins prevents them from running while the swap
+        * file is active and moving the extents. Note that this also prevents
+        * concurrent device add which isn't actually necessary, but it's not
         * really worth the trouble to allow it.
         */
-       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+       if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
                btrfs_warn(fs_info,
           "cannot activate swapfile while exclusive operation is running");
                return -EBUSY;
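
Condensed, the exclusive-operation guard introduced here pairs btrfs_exclop_start() at the top of swap activation with btrfs_exclop_finish() in the next hunk. A hedged sketch of just that shape, with the body elided (nothing here beyond what the hunks already show):

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
		btrfs_warn(fs_info,
	   "cannot activate swapfile while exclusive operation is running");
		return -EBUSY;
	}

	/* ... walk the file extents and build the swap extent map ... */

	btrfs_exclop_finish(fs_info);
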
@@ -10225,7 +10251,7 @@ out:
        if (ret)
                btrfs_swap_deactivate(file);
 
-       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+       btrfs_exclop_finish(fs_info);
 
        if (ret)
                return ret;
@@ -10250,6 +10276,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 }
 #endif
 
+/*
+ * Update the number of bytes used in the VFS' inode. When we replace extents in
+ * a range (clone, dedupe, fallocate's zero range), we must update the number of
+ * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
+ * always get a correct value.
+ */
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+                             const u64 add_bytes,
+                             const u64 del_bytes)
+{
+       if (add_bytes == del_bytes)
+               return;
+
+       spin_lock(&inode->lock);
+       if (del_bytes > 0)
+               inode_sub_bytes(&inode->vfs_inode, del_bytes);
+       if (add_bytes > 0)
+               inode_add_bytes(&inode->vfs_inode, add_bytes);
+       spin_unlock(&inode->lock);
+}
+
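+
+A hedged usage sketch for the new helper above; the byte counts are made up for illustration and the call is not taken from this patch. Since the helper is a no-op when add_bytes == del_bytes, callers can pass both totals unconditionally:
+
+	/*
+	 * Hypothetical example: a range formerly backed by a 1 MiB extent is
+	 * now backed by a 64 KiB one. Apply both adjustments under the inode
+	 * spinlock so a concurrent stat(2) never observes the intermediate
+	 * value.
+	 */
+	btrfs_update_inode_bytes(BTRFS_I(inode), SZ_64K, SZ_1M);
+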
 static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
@@ -10283,12 +10330,6 @@ static const struct file_operations btrfs_dir_file_operations = {
        .fsync          = btrfs_sync_file,
 };
 
-static const struct extent_io_ops btrfs_extent_io_ops = {
-       /* mandatory callbacks */
-       .submit_bio_hook = btrfs_submit_bio_hook,
-       .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
 /*
  * btrfs doesn't support the bmap operation because swapfiles
  * use bmap to make a mapping of extents in the file.  They assume
@@ -10306,7 +10347,7 @@ static const struct address_space_operations btrfs_aops = {
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .readahead      = btrfs_readahead,
-       .direct_IO      = btrfs_direct_IO,
+       .direct_IO      = noop_direct_IO,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
 #ifdef CONFIG_MIGRATION