Merge tag 'iomap-5.15-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 18:13:35 +0000 (11:13 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 18:13:35 +0000 (11:13 -0700)
Pull iomap updates from Darrick Wong:
 "The most notable externally visible change for this cycle is the
  addition of support for reads to inline tail fragments of files, which
  was requested by the erofs developers; and a correction for a kernel
  memory corruption bug if the sysadmin tries to activate a swapfile
  with more pages than the swapfile header suggests.

  We also now report writeback completion errors to the file mapping
  correctly, instead of munging all errors into EIO.

  Internally, the bulk of the changes is Christoph's patchset to reduce
  the indirect function call count by a third to a half by converting
  iomap iteration from a loop pattern to a generator/consumer pattern.
  As an added bonus, fsdax no longer open-codes iomap_apply-style loops.

  Summary:

   - Simplify the bio_end_page usage in the buffered IO code.

   - Support reading inline data at nonzero offsets for erofs (see the
     second sketch after this summary).

   - Fix some typos and bad grammar.

   - Convert kmap_atomic usage in the inline data read path.

   - Add some extra inline data input checking.

   - Fix a memory corruption bug stemming from iomap_swapfile_activate
     trying to activate more pages than mm was expecting.

   - Pass errnos through the page writeback code so that writeback
     errors are reported correctly instead of being munged to EIO.

   - Replace iomap_apply with open-coded iterator loops to reduce the
     number of indirect calls by a third to a half (see the first sketch
     after this summary).

   - Refactor the fsdax code to use iomap iterators instead of the
     open-coded iomap_apply code that it had before.

   - Format file range iomap tracepoint data in hexadecimal and
     standardize the names used in the pretty-print string"
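
To illustrate the iteration pattern named in the summary, here is a minimal
caller-side sketch modeled on the dax_iomap_rw() conversion visible in the
fs/dax.c hunks below.  Only struct iomap_iter, iomap_iter() and
iomap_length() are real iomap interfaces; my_fs_op() and my_fs_iter() are
hypothetical placeholders for a filesystem operation and its per-mapping
consumer.

#include <linux/fs.h>
#include <linux/iomap.h>

/* Hypothetical consumer: does the filesystem's work for the current mapping. */
static loff_t my_fs_iter(const struct iomap_iter *iter)
{
        return iomap_length(iter);      /* pretend the whole extent was handled */
}

static ssize_t my_fs_op(struct inode *inode, loff_t pos, u64 len,
                        const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode  = inode,
                .pos    = pos,
                .len    = len,
        };
        int ret;

        /*
         * iomap_iter() maps the next extent through ->iomap_begin() and, on
         * the following pass, advances iter.pos/iter.len by iter.processed
         * before calling ->iomap_end().  The per-extent work lives in the
         * loop body instead of an actor callback handed to iomap_apply().
         */
        while ((ret = iomap_iter(&iter, ops)) > 0)
                iter.processed = my_fs_iter(&iter);     /* bytes done or -errno */

        /* bytes consumed across all passes, or the first error */
        return iter.pos - pos ? iter.pos - pos : ret;
}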

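The inline-data read support mentioned above comes down to copying the tail
fragment from iomap->inline_data into the page at the extent's in-page
offset and zeroing the remainder, with kmap_local_page() replacing the old
kmap_atomic() usage.  The sketch below is illustrative only, assumes the
fragment fits inside one page, and omits the bounds checking and uptodate
handling of the real iomap helper.

#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/iomap.h>
#include <linux/string.h>

/* Illustrative sketch: fill a page from an inline extent at a nonzero offset. */
static void sketch_read_inline(const struct iomap *iomap, struct inode *inode,
                               struct page *page)
{
        size_t size = i_size_read(inode) - iomap->offset;
        size_t poff = offset_in_page(iomap->offset);
        void *addr;

        addr = kmap_local_page(page) + poff;            /* was kmap_atomic() */
        memcpy(addr, iomap->inline_data, size);
        memset(addr + size, 0, PAGE_SIZE - poff - size);
        kunmap_local(addr);
}
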
* tag 'iomap-5.15-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (41 commits)
  iomap: standardize tracepoint formatting and storage
  mm/swap: consider max pages in iomap_swapfile_add_extent
  iomap: move loop control code to iter.c
  iomap: constify iomap_iter_srcmap
  fsdax: switch the fault handlers to use iomap_iter
  fsdax: factor out a dax_fault_actor() helper
  fsdax: factor out helpers to simplify the dax fault code
  iomap: rework unshare flag
  iomap: pass an iomap_iter to various buffered I/O helpers
  iomap: remove iomap_apply
  fsdax: switch dax_iomap_rw to use iomap_iter
  iomap: switch iomap_swapfile_activate to use iomap_iter
  iomap: switch iomap_seek_data to use iomap_iter
  iomap: switch iomap_seek_hole to use iomap_iter
  iomap: switch iomap_bmap to use iomap_iter
  iomap: switch iomap_fiemap to use iomap_iter
  iomap: switch __iomap_dio_rw to use iomap_iter
  iomap: switch iomap_page_mkwrite to use iomap_iter
  iomap: switch iomap_zero_range to use iomap_iter
  iomap: switch iomap_file_unshare to use iomap_iter
  ...

fs/btrfs/inode.c
fs/dax.c
fs/internal.h

diff --combined fs/btrfs/inode.c
@@@ -32,7 -32,6 +32,7 @@@
  #include <linux/sched/mm.h>
  #include <linux/iomap.h>
  #include <asm/unaligned.h>
 +#include <linux/fsverity.h>
  #include "misc.h"
  #include "ctree.h"
  #include "disk-io.h"
@@@ -287,8 -286,9 +287,8 @@@ static int insert_inline_extent(struct 
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);
  
 -                      kaddr = kmap_atomic(cpage);
 +                      kaddr = page_address(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
 -                      kunmap_atomic(kaddr);
  
                        i++;
                        ptr += cur_size;
@@@ -490,9 -490,6 +490,9 @@@ static noinline int add_async_extent(st
   */
  static inline bool inode_can_compress(struct btrfs_inode *inode)
  {
 +      /* Subpage doesn't support compression yet */
 +      if (inode->root->fs_info->sectorsize < PAGE_SIZE)
 +              return false;
        if (inode->flags & BTRFS_INODE_NODATACOW ||
            inode->flags & BTRFS_INODE_NODATASUM)
                return false;
@@@ -632,7 -629,7 +632,7 @@@ again
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
 -      if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
 +      if (inode_need_compress(BTRFS_I(inode), start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                }
        }
  cont:
 -      if (start == 0) {
 +      /*
 +       * Check cow_file_range() for why we don't even try to create inline
 +       * extent for subpage case.
 +       */
 +      if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                /* lets try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
@@@ -980,7 -973,7 +980,7 @@@ retry
  
                        p->mapping = inode->vfs_inode.i_mapping;
                        btrfs_writepage_endio_finish_ordered(inode, p, start,
 -                                                           end, 0);
 +                                                           end, false);
  
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@@ -1087,17 -1080,7 +1087,17 @@@ static noinline int cow_file_range(stru
  
        inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
  
 -      if (start == 0) {
 +      /*
 +       * Due to the page size limit, for subpage we can only trigger the
 +       * writeback for the dirty sectors of page, that means data writeback
 +       * is doing more writeback than what we want.
 +       *
 +       * This is especially unexpected for some call sites like fallocate,
 +       * where we only increase i_size after everything is done.
 +       * This means we can trigger inline extent even if we didn't want to.
 +       * So here we skip inline extent creation completely.
 +       */
 +      if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                /* lets try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
@@@ -1307,6 -1290,11 +1307,6 @@@ static noinline void async_cow_submit(s
        nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                PAGE_SHIFT;
  
 -      /* atomic_sub_return implies a barrier */
 -      if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
 -          5 * SZ_1M)
 -              cond_wake_up_nomb(&fs_info->async_submit_wait);
 -
        /*
         * ->inode could be NULL if async_chunk_start has failed to compress,
         * in which case we don't have anything to submit, yet we need to
         */
        if (async_chunk->inode)
                submit_compressed_extents(async_chunk);
 +
 +      /* atomic_sub_return implies a barrier */
 +      if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
 +          5 * SZ_1M)
 +              cond_wake_up_nomb(&fs_info->async_submit_wait);
  }
  
  static noinline void async_cow_free(struct btrfs_work *work)
@@@ -1963,7 -1946,6 +1963,7 @@@ int btrfs_run_delalloc_range(struct btr
                ret = cow_file_range_async(inode, wbc, locked_page, start, end,
                                           page_started, nr_written);
        }
 +      ASSERT(ret <= 0);
        if (ret)
                btrfs_cleanup_ordered_extents(inode, locked_page, start,
                                              end - start + 1);
@@@ -2303,6 -2285,7 +2303,6 @@@ static int split_zoned_em(struct btrfs_
        struct extent_map *split_mid = NULL;
        struct extent_map *split_post = NULL;
        int ret = 0;
 -      int modified;
        unsigned long flags;
  
        /* Sanity check */
        ASSERT(em->len == len);
        ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
        ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
 +      ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 +      ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
 +      ASSERT(!list_empty(&em->list));
  
        flags = em->flags;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 -      clear_bit(EXTENT_FLAG_LOGGING, &flags);
 -      modified = !list_empty(&em->list);
  
        /* First, replace the em with a new extent_map starting from * em->start */
        split_pre->start = em->start;
        split_pre->compress_type = em->compress_type;
        split_pre->generation = em->generation;
  
 -      replace_extent_mapping(em_tree, em, split_pre, modified);
 +      replace_extent_mapping(em_tree, em, split_pre, 1);
  
        /*
         * Now we only have an extent_map at:
                split_mid->flags = flags;
                split_mid->compress_type = em->compress_type;
                split_mid->generation = em->generation;
 -              add_extent_mapping(em_tree, split_mid, modified);
 +              add_extent_mapping(em_tree, split_mid, 1);
        }
  
        if (post) {
                split_post->flags = flags;
                split_post->compress_type = em->compress_type;
                split_post->generation = em->generation;
 -              add_extent_mapping(em_tree, split_post, modified);
 +              add_extent_mapping(em_tree, split_post, 1);
        }
  
        /* Once for us */
@@@ -2788,7 -2770,7 +2788,7 @@@ out_page
   * to fix it up.  The async helper will wait for ordered extents, set
   * the delalloc bit and make it safe to write the page.
   */
 -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
 +int btrfs_writepage_cow_fixup(struct page *page)
  {
        struct inode *inode = page->mapping->host;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@@ -3189,7 -3171,7 +3189,7 @@@ static void finish_ordered_fn(struct bt
  
  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                          struct page *page, u64 start,
 -                                        u64 end, int uptodate)
 +                                        u64 end, bool uptodate)
  {
        trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
  
@@@ -3275,44 -3257,25 +3275,44 @@@ unsigned int btrfs_verify_data_csum(str
                return 0;
        }
  
 -      if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 +      /*
 +       * For subpage case, above PageChecked is not safe as it's not subpage
 +       * compatible.
 +       * But for now only cow fixup and compressed read utilize PageChecked
 +       * flag, while in this context we can easily use io_bio->csum to
 +       * determine if we really need to do csum verification.
 +       *
 +       * So for now, just exit if io_bio->csum is NULL, as it means it's
 +       * compressed read, and its compressed data csum has already been
 +       * verified.
 +       */
 +      if (io_bio->csum == NULL)
                return 0;
  
 -      if (!root->fs_info->csum_root)
 +      if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                return 0;
  
 -      if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
 -          test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
 -              clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
 +      if (!root->fs_info->csum_root)
                return 0;
 -      }
  
        ASSERT(page_offset(page) <= start &&
               end <= page_offset(page) + PAGE_SIZE - 1);
        for (pg_off = offset_in_page(start);
             pg_off < offset_in_page(end);
             pg_off += sectorsize, bio_offset += sectorsize) {
 +              u64 file_offset = pg_off + page_offset(page);
                int ret;
  
 +              if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
 +                  test_range_bit(io_tree, file_offset,
 +                                 file_offset + sectorsize - 1,
 +                                 EXTENT_NODATASUM, 1, NULL)) {
 +                      /* Skip the range without csum for data reloc inode */
 +                      clear_extent_bits(io_tree, file_offset,
 +                                        file_offset + sectorsize - 1,
 +                                        EXTENT_NODATASUM);
 +                      continue;
 +              }
                ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
                                      page_offset(page) + pg_off);
                if (ret < 0) {
@@@ -3557,14 -3520,7 +3557,14 @@@ int btrfs_orphan_cleanup(struct btrfs_r
  
                /*
                 * If we have an inode with links, there are a couple of
 -               * possibilities. Old kernels (before v3.12) used to create an
 +               * possibilities:
 +               *
 +               * 1. We were halfway through creating fsverity metadata for the
 +               * file. In that case, the orphan item represents incomplete
 +               * fsverity metadata which must be cleaned up with
 +               * btrfs_drop_verity_items and deleting the orphan item.
 +
 +               * 2. Old kernels (before v3.12) used to create an
                 * orphan item for truncate indicating that there were possibly
                 * extent items past i_size that needed to be deleted. In v3.12,
                 * truncate was changed to update i_size in sync with the extent
                 * but either way, we can delete the orphan item.
                 */
                if (ret == -ENOENT || inode->i_nlink) {
 -                      if (!ret)
 +                      if (!ret) {
 +                              ret = btrfs_drop_verity_items(BTRFS_I(inode));
                                iput(inode);
 +                              if (ret)
 +                                      goto out;
 +                      }
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
@@@ -3776,8 -3728,7 +3776,8 @@@ static int btrfs_read_locked_inode(stru
        rdev = btrfs_inode_rdev(leaf, inode_item);
  
        BTRFS_I(inode)->index_cnt = (u64)-1;
 -      BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 +      btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
 +                              &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
  
  cache_index:
        /*
@@@ -3908,7 -3859,6 +3908,7 @@@ static void fill_inode_item(struct btrf
                            struct inode *inode)
  {
        struct btrfs_map_token token;
 +      u64 flags;
  
        btrfs_init_map_token(&token, leaf);
  
        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
        btrfs_set_token_inode_transid(&token, item, trans->transid);
        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
 -      btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
 +      flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
 +                                        BTRFS_I(inode)->ro_flags);
 +      btrfs_set_token_inode_flags(&token, item, flags);
        btrfs_set_token_inode_block_group(&token, item, 0);
  }
  
@@@ -5140,13 -5088,15 +5140,13 @@@ static int maybe_insert_hole(struct btr
        int ret;
  
        /*
 -       * Still need to make sure the inode looks like it's been updated so
 -       * that any holes get logged if we fsync.
 +       * If NO_HOLES is enabled, we don't need to do anything.
 +       * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
 +       * or btrfs_update_inode() will be called, which guarantee that the next
 +       * fsync will know this inode was changed and needs to be logged.
         */
 -      if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
 -              inode->last_trans = fs_info->generation;
 -              inode->last_sub_trans = root->log_transid;
 -              inode->last_log_commit = root->last_log_commit;
 +      if (btrfs_fs_incompat(fs_info, NO_HOLES))
                return 0;
 -      }
  
        /*
         * 1 - for the one we're dropping
@@@ -5392,7 -5342,7 +5392,7 @@@ static int btrfs_setattr(struct user_na
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      err = setattr_prepare(&init_user_ns, dentry, attr);
 +      err = setattr_prepare(mnt_userns, dentry, attr);
        if (err)
                return err;
  
        }
  
        if (attr->ia_valid) {
 -              setattr_copy(&init_user_ns, inode, attr);
 +              setattr_copy(mnt_userns, inode, attr);
                inode_inc_iversion(inode);
                err = btrfs_dirty_inode(inode);
  
                if (!err && attr->ia_valid & ATTR_MODE)
 -                      err = posix_acl_chmod(&init_user_ns, inode,
 -                                            inode->i_mode);
 +                      err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
        }
  
        return err;
@@@ -5571,7 -5522,6 +5571,7 @@@ void btrfs_evict_inode(struct inode *in
        trace_btrfs_inode_evict(inode);
  
        if (!root) {
 +              fsverity_cleanup_inode(inode);
                clear_inode(inode);
                return;
        }
@@@ -5654,7 -5604,6 +5654,7 @@@ no_delete
         * to retry these periodically in the future.
         */
        btrfs_remove_delayed_node(BTRFS_I(inode));
 +      fsverity_cleanup_inode(inode);
        clear_inode(inode);
  }
  
@@@ -6421,7 -6370,6 +6421,7 @@@ static void btrfs_inherit_iflags(struc
  
  static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
 +                                   struct user_namespace *mnt_userns,
                                     struct inode *dir,
                                     const char *name, int name_len,
                                     u64 ref_objectid, u64 objectid,
        if (ret != 0)
                goto fail_unlock;
  
 -      inode_init_owner(&init_user_ns, inode, dir, mode);
 +      inode_init_owner(mnt_userns, inode, dir, mode);
        inode_set_bytes(inode, 0);
  
        inode->i_mtime = current_time(inode);
@@@ -6716,9 -6664,9 +6716,9 @@@ static int btrfs_mknod(struct user_name
        if (err)
                goto out_unlock;
  
 -      inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 -                      dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
 -                      mode, &index);
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 +                      dentry->d_name.name, dentry->d_name.len,
 +                      btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@@ -6780,9 -6728,9 +6780,9 @@@ static int btrfs_create(struct user_nam
        if (err)
                goto out_unlock;
  
 -      inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 -                      dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
 -                      mode, &index);
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 +                      dentry->d_name.name, dentry->d_name.len,
 +                      btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@@ -6925,9 -6873,8 +6925,9 @@@ static int btrfs_mkdir(struct user_name
        if (err)
                goto out_fail;
  
 -      inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 -                      dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 +                      dentry->d_name.name, dentry->d_name.len,
 +                      btrfs_ino(BTRFS_I(dir)), objectid,
                        S_IFDIR | mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
@@@ -8247,9 -8194,10 +8247,10 @@@ static struct btrfs_dio_private *btrfs_
        return dip;
  }
  
- static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
                struct bio *dio_bio, loff_t file_offset)
  {
+       struct inode *inode = iter->inode;
        const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
        u64 start_sector;
        int async_submit = 0;
        u64 submit_len;
 -      int clone_offset = 0;
 -      int clone_len;
 +      u64 clone_offset = 0;
 +      u64 clone_len;
        u64 logical;
        int ret;
        blk_status_t status;
        struct btrfs_io_geometry geom;
-       struct btrfs_dio_data *dio_data = iomap->private;
+       struct btrfs_dio_data *dio_data = iter->iomap.private;
        struct extent_map *em = NULL;
  
        dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
                        status = errno_to_blk_status(ret);
                        goto out_err_em;
                }
 -              ASSERT(geom.len <= INT_MAX);
  
 -              clone_len = min_t(int, submit_len, geom.len);
 +              clone_len = min(submit_len, geom.len);
 +              ASSERT(clone_len <= UINT_MAX);
  
                /*
                 * This will never fail as it's passing GPF_NOFS and
@@@ -8454,47 -8402,11 +8455,47 @@@ static void btrfs_readahead(struct read
        extent_readahead(rac);
  }
  
 +/*
 + * For releasepage() and invalidatepage() we have a race window where
 + * end_page_writeback() is called but the subpage spinlock is not yet released.
 + * If we continue to release/invalidate the page, we could cause use-after-free
 + * for subpage spinlock.  So this function is to spin and wait for subpage
 + * spinlock.
 + */
 +static void wait_subpage_spinlock(struct page *page)
 +{
 +      struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
 +      struct btrfs_subpage *subpage;
 +
 +      if (fs_info->sectorsize == PAGE_SIZE)
 +              return;
 +
 +      ASSERT(PagePrivate(page) && page->private);
 +      subpage = (struct btrfs_subpage *)page->private;
 +
 +      /*
 +       * This may look insane as we just acquire the spinlock and release it,
 +       * without doing anything.  But we just want to make sure no one is
 +       * still holding the subpage spinlock.
 +       * And since the page is not dirty nor writeback, and we have page
 +       * locked, the only possible way to hold a spinlock is from the endio
 +       * function to clear page writeback.
 +       *
 +       * Here we just acquire the spinlock so that all existing callers
 +       * should exit and we're safe to release/invalidate the page.
 +       */
 +      spin_lock_irq(&subpage->lock);
 +      spin_unlock_irq(&subpage->lock);
 +}
 +
  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
  {
        int ret = try_release_extent_mapping(page, gfp_flags);
 -      if (ret == 1)
 +
 +      if (ret == 1) {
 +              wait_subpage_spinlock(page);
                clear_page_extent_mapped(page);
 +      }
        return ret;
  }
  
@@@ -8558,7 -8470,6 +8559,7 @@@ static void btrfs_invalidatepage(struc
         * do double ordered extent accounting on the same page.
         */
        wait_on_page_writeback(page);
 +      wait_subpage_spinlock(page);
  
        /*
         * For subpage case, we have call sites like
                spin_unlock_irq(&inode->ordered_tree.lock);
  
                if (btrfs_dec_test_ordered_pending(inode, &ordered,
 -                                      cur, range_end + 1 - cur, 1)) {
 +                                                 cur, range_end + 1 - cur)) {
                        btrfs_finish_ordered_io(ordered);
                        /*
                         * The ordered extent has finished, now we're again
@@@ -9028,8 -8939,7 +9029,8 @@@ out
   */
  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                             struct btrfs_root *new_root,
 -                           struct btrfs_root *parent_root)
 +                           struct btrfs_root *parent_root,
 +                           struct user_namespace *mnt_userns)
  {
        struct inode *inode;
        int err;
        if (err < 0)
                return err;
  
 -      inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
 +      inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
 +                              ino, ino,
                                S_IFDIR | (~current_umask() & S_IRWXUGO),
                                &index);
        if (IS_ERR(inode))
@@@ -9085,7 -8994,6 +9086,7 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
 +      ei->ro_flags = 0;
        ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->dir_index = 0;
@@@ -9267,7 -9175,6 +9268,7 @@@ static int btrfs_getattr(struct user_na
        struct inode *inode = d_inode(path->dentry);
        u32 blocksize = inode->i_sb->s_blocksize;
        u32 bi_flags = BTRFS_I(inode)->flags;
 +      u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
  
        stat->result_mask |= STATX_BTIME;
        stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (bi_flags & BTRFS_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
 +      if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
 +              stat->attributes |= STATX_ATTR_VERITY;
  
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP);
  
 -      generic_fillattr(&init_user_ns, inode, stat);
 +      generic_fillattr(mnt_userns, inode, stat);
        stat->dev = BTRFS_I(inode)->root->anon_dev;
  
        spin_lock(&BTRFS_I(inode)->lock);
@@@ -9322,14 -9227,8 +9323,14 @@@ static int btrfs_rename_exchange(struc
        bool dest_log_pinned = false;
        bool need_abort = false;
  
 -      /* we only allow rename subvolume link between subvolumes */
 -      if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 +      /*
 +       * For non-subvolumes allow exchange only within one subvolume, in the
 +       * same inode namespace. Two subvolumes (represented as directory) can
 +       * be exchanged as they're a logical link and have a fixed inode number.
 +       */
 +      if (root != dest &&
 +          (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
 +           new_ino != BTRFS_FIRST_FREE_OBJECTID))
                return -EXDEV;
  
        /* close the race window with snapshot create/destroy ioctl */
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
 -              btrfs_pin_log_trans(root);
 -              root_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
 -              btrfs_pin_log_trans(dest);
 -              dest_log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, root,
                                             old_dentry->d_name.name,
                                             old_dentry->d_name.len,
                                BTRFS_I(new_inode), 1);
        }
  
 +      /*
 +       * Now pin the logs of the roots. We do it to ensure that no other task
 +       * can sync the logs while we are in progress with the rename, because
 +       * that could result in an inconsistency in case any of the inodes that
 +       * are part of this rename operation were logged before.
 +       *
 +       * We pin the logs even if at this precise moment none of the inodes was
 +       * logged before. This is because right after we checked for that, some
 +       * other task fsyncing some other inode not involved with this rename
 +       * operation could log that one of our inodes exists.
 +       *
 +       * We don't need to pin the logs before the above calls to
 +       * btrfs_insert_inode_ref(), since those don't ever need to change a log.
 +       */
 +      if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
 +              btrfs_pin_log_trans(root);
 +              root_log_pinned = true;
 +      }
 +      if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
 +              btrfs_pin_log_trans(dest);
 +              dest_log_pinned = true;
 +      }
 +
        /* src is a subvolume */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@@ -9526,7 -9406,8 +9527,7 @@@ out_fail
                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
 -                  (new_inode &&
 -                   btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
 +                  btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
                        btrfs_set_log_full_commit(trans);
  
                if (root_log_pinned) {
@@@ -9550,7 -9431,6 +9551,7 @@@ out_notrans
  
  static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
 +                                   struct user_namespace *mnt_userns,
                                     struct inode *dir,
                                     struct dentry *dentry)
  {
        if (ret)
                return ret;
  
 -      inode = btrfs_new_inode(trans, root, dir,
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir,
                                dentry->d_name.name,
                                dentry->d_name.len,
                                btrfs_ino(BTRFS_I(dir)),
        return ret;
  }
  
 -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 -                         struct inode *new_dir, struct dentry *new_dentry,
 -                         unsigned int flags)
 +static int btrfs_rename(struct user_namespace *mnt_userns,
 +                      struct inode *old_dir, struct dentry *old_dentry,
 +                      struct inode *new_dir, struct dentry *new_dentry,
 +                      unsigned int flags)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
        struct btrfs_trans_handle *trans;
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
 -              btrfs_pin_log_trans(root);
 -              log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
        } else {
 +              /*
 +               * Now pin the log. We do it to ensure that no other task can
 +               * sync the log while we are in progress with the rename, as
 +               * that could result in an inconsistency in case any of the
 +               * inodes that are part of this rename operation were logged
 +               * before.
 +               *
 +               * We pin the log even if at this precise moment none of the
 +               * inodes was logged before. This is because right after we
 +               * checked for that, some other task fsyncing some other inode
 +               * not involved with this rename operation could log that one of
 +               * our inodes exists.
 +               *
 +               * We don't need to pin the logs before the above call to
 +               * btrfs_insert_inode_ref(), since that does not need to change
 +               * a log.
 +               */
 +              btrfs_pin_log_trans(root);
 +              log_pinned = true;
                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
                                        BTRFS_I(d_inode(old_dentry)),
                                        old_dentry->d_name.name,
        }
  
        if (flags & RENAME_WHITEOUT) {
 -              ret = btrfs_whiteout_for_rename(trans, root, old_dir,
 -                                              old_dentry);
 +              ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
 +                                              old_dir, old_dentry);
  
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@@ -9844,8 -9706,7 +9845,8 @@@ static int btrfs_rename2(struct user_na
                return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
                                          new_dentry);
  
 -      return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 +      return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
 +                          new_dentry, flags);
  }
  
  struct btrfs_delalloc_work {
@@@ -9942,7 -9803,11 +9943,7 @@@ static int start_delalloc_inodes(struc
                        btrfs_queue_work(root->fs_info->flush_workers,
                                         &work->work);
                } else {
 -                      ret = sync_inode(inode, wbc);
 -                      if (!ret &&
 -                          test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 -                                   &BTRFS_I(inode)->runtime_flags))
 -                              ret = sync_inode(inode, wbc);
 +                      ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
                        btrfs_add_delayed_iput(inode);
                        if (ret || wbc->nr_to_write <= 0)
                                goto out;
@@@ -10077,10 -9942,9 +10078,10 @@@ static int btrfs_symlink(struct user_na
        if (err)
                goto out_unlock;
  
 -      inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 -                              dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
 -                              objectid, S_IFLNK|S_IRWXUGO, &index);
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 +                              dentry->d_name.name, dentry->d_name.len,
 +                              btrfs_ino(BTRFS_I(dir)), objectid,
 +                              S_IFLNK | S_IRWXUGO, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                inode = NULL;
@@@ -10404,7 -10268,7 +10405,7 @@@ static int btrfs_permission(struct user
                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
                        return -EACCES;
        }
 -      return generic_permission(&init_user_ns, inode, mask);
 +      return generic_permission(mnt_userns, inode, mask);
  }
  
  static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
        if (ret)
                goto out;
  
 -      inode = btrfs_new_inode(trans, root, dir, NULL, 0,
 +      inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
                        btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
diff --combined fs/dax.c
+++ b/fs/dax.c
@@@ -722,7 -722,7 +722,7 @@@ static int copy_cow_page_dax(struct blo
                return rc;
  
        id = dax_read_lock();
 -      rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
 +      rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
        if (rc < 0) {
                dax_read_unlock(id);
                return rc;
@@@ -1005,12 -1005,12 +1005,12 @@@ int dax_writeback_mapping_range(struct 
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  
- static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+ static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
  {
        return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
  }
  
- static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
                         pfn_t *pfnp)
  {
        const sector_t sector = dax_iomap_sector(iomap, pos);
@@@ -1066,6 -1066,66 +1066,66 @@@ static vm_fault_t dax_load_hole(struct 
        return ret;
  }
  
+ #ifdef CONFIG_FS_DAX_PMD
+ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+               const struct iomap *iomap, void **entry)
+ {
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       unsigned long pmd_addr = vmf->address & PMD_MASK;
+       struct vm_area_struct *vma = vmf->vma;
+       struct inode *inode = mapping->host;
+       pgtable_t pgtable = NULL;
+       struct page *zero_page;
+       spinlock_t *ptl;
+       pmd_t pmd_entry;
+       pfn_t pfn;
+       zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+       if (unlikely(!zero_page))
+               goto fallback;
+       pfn = page_to_pfn_t(zero_page);
+       *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+                       DAX_PMD | DAX_ZERO_PAGE, false);
+       if (arch_needs_pgtable_deposit()) {
+               pgtable = pte_alloc_one(vma->vm_mm);
+               if (!pgtable)
+                       return VM_FAULT_OOM;
+       }
+       ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+       if (!pmd_none(*(vmf->pmd))) {
+               spin_unlock(ptl);
+               goto fallback;
+       }
+       if (pgtable) {
+               pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+               mm_inc_nr_ptes(vma->vm_mm);
+       }
+       pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+       pmd_entry = pmd_mkhuge(pmd_entry);
+       set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
+       spin_unlock(ptl);
+       trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+       return VM_FAULT_NOPAGE;
+ fallback:
+       if (pgtable)
+               pte_free(vma->vm_mm, pgtable);
+       trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+       return VM_FAULT_FALLBACK;
+ }
+ #else
+ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+               const struct iomap *iomap, void **entry)
+ {
+       return VM_FAULT_FALLBACK;
+ }
+ #endif /* CONFIG_FS_DAX_PMD */
  s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
  {
        sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
        return size;
  }
  
- static loff_t
- dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap, struct iomap *srcmap)
+ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
+               struct iov_iter *iter)
  {
+       const struct iomap *iomap = &iomi->iomap;
+       loff_t length = iomap_length(iomi);
+       loff_t pos = iomi->pos;
        struct block_device *bdev = iomap->bdev;
        struct dax_device *dax_dev = iomap->dax_dev;
-       struct iov_iter *iter = data;
        loff_t end = pos + length, done = 0;
        ssize_t ret = 0;
        size_t xfer;
        int id;
  
        if (iov_iter_rw(iter) == READ) {
-               end = min(end, i_size_read(inode));
+               end = min(end, i_size_read(iomi->inode));
                if (pos >= end)
                        return 0;
  
         * written by write(2) is visible in mmap.
         */
        if (iomap->flags & IOMAP_F_NEW) {
-               invalidate_inode_pages2_range(inode->i_mapping,
+               invalidate_inode_pages2_range(iomi->inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (end - 1) >> PAGE_SHIFT);
        }
@@@ -1209,31 -1270,29 +1270,29 @@@ ssize_
  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops)
  {
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       loff_t pos = iocb->ki_pos, ret = 0, done = 0;
-       unsigned flags = 0;
+       struct iomap_iter iomi = {
+               .inode          = iocb->ki_filp->f_mapping->host,
+               .pos            = iocb->ki_pos,
+               .len            = iov_iter_count(iter),
+       };
+       loff_t done = 0;
+       int ret;
  
        if (iov_iter_rw(iter) == WRITE) {
-               lockdep_assert_held_write(&inode->i_rwsem);
-               flags |= IOMAP_WRITE;
+               lockdep_assert_held_write(&iomi.inode->i_rwsem);
+               iomi.flags |= IOMAP_WRITE;
        } else {
-               lockdep_assert_held(&inode->i_rwsem);
+               lockdep_assert_held(&iomi.inode->i_rwsem);
        }
  
        if (iocb->ki_flags & IOCB_NOWAIT)
-               flags |= IOMAP_NOWAIT;
+               iomi.flags |= IOMAP_NOWAIT;
  
-       while (iov_iter_count(iter)) {
-               ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
-                               iter, dax_iomap_actor);
-               if (ret <= 0)
-                       break;
-               pos += ret;
-               done += ret;
-       }
+       while ((ret = iomap_iter(&iomi, ops)) > 0)
+               iomi.processed = dax_iomap_iter(&iomi, iter);
  
-       iocb->ki_pos += done;
+       done = iomi.pos - iocb->ki_pos;
+       iocb->ki_pos = iomi.pos;
        return done ? done : ret;
  }
  EXPORT_SYMBOL_GPL(dax_iomap_rw);
@@@ -1250,44 -1309,146 +1309,146 @@@ static vm_fault_t dax_fault_return(int 
   * flushed on write-faults (non-cow), but not read-faults.
   */
  static bool dax_fault_is_synchronous(unsigned long flags,
-               struct vm_area_struct *vma, struct iomap *iomap)
+               struct vm_area_struct *vma, const struct iomap *iomap)
  {
        return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
                && (iomap->flags & IOMAP_F_DIRTY);
  }
  
+ /*
+  * When handling a synchronous page fault and the inode need a fsync, we can
+  * insert the PTE/PMD into page tables only after that fsync happened. Skip
+  * insertion for now and return the pfn so that caller can insert it after the
+  * fsync is done.
+  */
+ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+ {
+       if (WARN_ON_ONCE(!pfnp))
+               return VM_FAULT_SIGBUS;
+       *pfnp = pfn;
+       return VM_FAULT_NEEDDSYNC;
+ }
+ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
+               const struct iomap_iter *iter)
+ {
+       sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
+       unsigned long vaddr = vmf->address;
+       vm_fault_t ret;
+       int error = 0;
+       switch (iter->iomap.type) {
+       case IOMAP_HOLE:
+       case IOMAP_UNWRITTEN:
+               clear_user_highpage(vmf->cow_page, vaddr);
+               break;
+       case IOMAP_MAPPED:
+               error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
+                                         sector, vmf->cow_page, vaddr);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               error = -EIO;
+               break;
+       }
+       if (error)
+               return dax_fault_return(error);
+       __SetPageUptodate(vmf->cow_page);
+       ret = finish_fault(vmf);
+       if (!ret)
+               return VM_FAULT_DONE_COW;
+       return ret;
+ }
+ /**
+  * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
+  * @vmf:      vm fault instance
+  * @iter:     iomap iter
+  * @pfnp:     pfn to be returned
+  * @xas:      the dax mapping tree of a file
+  * @entry:    an unlocked dax entry to be inserted
+  * @pmd:      distinguish whether it is a pmd fault
+  */
+ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
+               const struct iomap_iter *iter, pfn_t *pfnp,
+               struct xa_state *xas, void **entry, bool pmd)
+ {
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       const struct iomap *iomap = &iter->iomap;
+       size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
+       loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+       bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+       unsigned long entry_flags = pmd ? DAX_PMD : 0;
+       int err = 0;
+       pfn_t pfn;
+       if (!pmd && vmf->cow_page)
+               return dax_fault_cow_page(vmf, iter);
+       /* if we are reading UNWRITTEN and HOLE, return a hole. */
+       if (!write &&
+           (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
+               if (!pmd)
+                       return dax_load_hole(xas, mapping, entry, vmf);
+               return dax_pmd_load_hole(xas, vmf, iomap, entry);
+       }
+       if (iomap->type != IOMAP_MAPPED) {
+               WARN_ON_ONCE(1);
+               return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
+       }
+       err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+       if (err)
+               return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
+       *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
+                                 write && !sync);
+       if (sync)
+               return dax_fault_synchronous_pfnp(pfnp, pfn);
+       /* insert PMD pfn */
+       if (pmd)
+               return vmf_insert_pfn_pmd(vmf, pfn, write);
+       /* insert PTE pfn */
+       if (write)
+               return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+       return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ }
  static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               int *iomap_errp, const struct iomap_ops *ops)
  {
-       struct vm_area_struct *vma = vmf->vma;
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
-       struct inode *inode = mapping->host;
-       unsigned long vaddr = vmf->address;
-       loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-       struct iomap iomap = { .type = IOMAP_HOLE };
-       struct iomap srcmap = { .type = IOMAP_HOLE };
-       unsigned flags = IOMAP_FAULT;
-       int error, major = 0;
-       bool write = vmf->flags & FAULT_FLAG_WRITE;
-       bool sync;
+       struct iomap_iter iter = {
+               .inode          = mapping->host,
+               .pos            = (loff_t)vmf->pgoff << PAGE_SHIFT,
+               .len            = PAGE_SIZE,
+               .flags          = IOMAP_FAULT,
+       };
        vm_fault_t ret = 0;
        void *entry;
-       pfn_t pfn;
+       int error;
  
-       trace_dax_pte_fault(inode, vmf, ret);
+       trace_dax_pte_fault(iter.inode, vmf, ret);
        /*
         * Check whether offset isn't beyond end of file now. Caller is supposed
         * to hold locks serializing us with truncate / punch hole so this is
         * a reliable test.
         */
-       if (pos >= i_size_read(inode)) {
+       if (iter.pos >= i_size_read(iter.inode)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
  
-       if (write && !vmf->cow_page)
-               flags |= IOMAP_WRITE;
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+               iter.flags |= IOMAP_WRITE;
  
        entry = grab_mapping_entry(&xas, mapping, 0);
        if (xa_is_internal(entry)) {
                goto unlock_entry;
        }
  
-       /*
-        * Note that we don't bother to use iomap_apply here: DAX required
-        * the file system block size to be equal the page size, which means
-        * that we never have to deal with more than a single extent here.
-        */
-       error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
-       if (iomap_errp)
-               *iomap_errp = error;
-       if (error) {
-               ret = dax_fault_return(error);
-               goto unlock_entry;
-       }
-       if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-               error = -EIO;   /* fs corruption? */
-               goto error_finish_iomap;
-       }
-       if (vmf->cow_page) {
-               sector_t sector = dax_iomap_sector(&iomap, pos);
-               switch (iomap.type) {
-               case IOMAP_HOLE:
-               case IOMAP_UNWRITTEN:
-                       clear_user_highpage(vmf->cow_page, vaddr);
-                       break;
-               case IOMAP_MAPPED:
-                       error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
-                                                 sector, vmf->cow_page, vaddr);
-                       break;
-               default:
-                       WARN_ON_ONCE(1);
-                       error = -EIO;
-                       break;
+       while ((error = iomap_iter(&iter, ops)) > 0) {
+               if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
+                       iter.processed = -EIO;  /* fs corruption? */
+                       continue;
                }
  
-               if (error)
-                       goto error_finish_iomap;
-               __SetPageUptodate(vmf->cow_page);
-               ret = finish_fault(vmf);
-               if (!ret)
-                       ret = VM_FAULT_DONE_COW;
-               goto finish_iomap;
-       }
-       sync = dax_fault_is_synchronous(flags, vma, &iomap);
-       switch (iomap.type) {
-       case IOMAP_MAPPED:
-               if (iomap.flags & IOMAP_F_NEW) {
+               ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
+               if (ret != VM_FAULT_SIGBUS &&
+                   (iter.iomap.flags & IOMAP_F_NEW)) {
                        count_vm_event(PGMAJFAULT);
-                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
+                       count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+                       ret |= VM_FAULT_MAJOR;
                }
-               error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
-               if (error < 0)
-                       goto error_finish_iomap;
  
-               entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
-                                                0, write && !sync);
-               /*
-                * If we are doing synchronous page fault and inode needs fsync,
-                * we can insert PTE into page tables only after that happens.
-                * Skip insertion for now and return the pfn so that caller can
-                * insert it after fsync is done.
-                */
-               if (sync) {
-                       if (WARN_ON_ONCE(!pfnp)) {
-                               error = -EIO;
-                               goto error_finish_iomap;
-                       }
-                       *pfnp = pfn;
-                       ret = VM_FAULT_NEEDDSYNC | major;
-                       goto finish_iomap;
-               }
-               trace_dax_insert_mapping(inode, vmf, entry);
-               if (write)
-                       ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
-               else
-                       ret = vmf_insert_mixed(vma, vaddr, pfn);
-               goto finish_iomap;
-       case IOMAP_UNWRITTEN:
-       case IOMAP_HOLE:
-               if (!write) {
-                       ret = dax_load_hole(&xas, mapping, &entry, vmf);
-                       goto finish_iomap;
-               }
-               fallthrough;
-       default:
-               WARN_ON_ONCE(1);
-               error = -EIO;
-               break;
+               if (!(ret & VM_FAULT_ERROR))
+                       iter.processed = PAGE_SIZE;
        }
  
-  error_finish_iomap:
-       ret = dax_fault_return(error);
-  finish_iomap:
-       if (ops->iomap_end) {
-               int copied = PAGE_SIZE;
+       if (iomap_errp)
+               *iomap_errp = error;
+       if (!ret && error)
+               ret = dax_fault_return(error);
  
-               if (ret & VM_FAULT_ERROR)
-                       copied = 0;
-               /*
-                * The fault is done by now and there's no way back (other
-                * thread may be already happily using PTE we have installed).
-                * Just ignore error from ->iomap_end since we cannot do much
-                * with it.
-                */
-               ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
-       }
-  unlock_entry:
+ unlock_entry:
        dax_unlock_entry(&xas, entry);
 out:
-       trace_dax_pte_fault_done(inode, vmf, ret);
-       return ret | major;
+ out:
+       trace_dax_pte_fault_done(iter.inode, vmf, ret);
+       return ret;
  }
  
  #ifdef CONFIG_FS_DAX_PMD
- static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
-               struct iomap *iomap, void **entry)
+ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
+               pgoff_t max_pgoff)
  {
-       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
-       struct vm_area_struct *vma = vmf->vma;
-       struct inode *inode = mapping->host;
-       pgtable_t pgtable = NULL;
-       struct page *zero_page;
-       spinlock_t *ptl;
-       pmd_t pmd_entry;
-       pfn_t pfn;
-       zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
-       if (unlikely(!zero_page))
-               goto fallback;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
  
-       pfn = page_to_pfn_t(zero_page);
-       *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
-                       DAX_PMD | DAX_ZERO_PAGE, false);
+       /*
+        * Make sure that the faulting address's PMD offset (color) matches
+        * the PMD offset from the start of the file.  This is necessary so
+        * that a PMD range in the page table overlaps exactly with a PMD
+        * range in the page cache.
+        */
+       if ((vmf->pgoff & PG_PMD_COLOUR) !=
+           ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
+               return true;
  
-       if (arch_needs_pgtable_deposit()) {
-               pgtable = pte_alloc_one(vma->vm_mm);
-               if (!pgtable)
-                       return VM_FAULT_OOM;
-       }
+       /* Fall back to PTEs if we're going to COW */
+       if (write && !(vmf->vma->vm_flags & VM_SHARED))
+               return true;
  
-       ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
-       if (!pmd_none(*(vmf->pmd))) {
-               spin_unlock(ptl);
-               goto fallback;
-       }
+       /* If the PMD would extend outside the VMA */
+       if (pmd_addr < vmf->vma->vm_start)
+               return true;
+       if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
+               return true;
  
-       if (pgtable) {
-               pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-               mm_inc_nr_ptes(vma->vm_mm);
-       }
-       pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
-       pmd_entry = pmd_mkhuge(pmd_entry);
-       set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
-       spin_unlock(ptl);
-       trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
-       return VM_FAULT_NOPAGE;
+       /* If the PMD would extend beyond the file size */
+       if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
+               return true;
  
- fallback:
-       if (pgtable)
-               pte_free(vma->vm_mm, pgtable);
-       trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
-       return VM_FAULT_FALLBACK;
+       return false;
  }
  
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
  {
-       struct vm_area_struct *vma = vmf->vma;
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
-       unsigned long pmd_addr = vmf->address & PMD_MASK;
-       bool write = vmf->flags & FAULT_FLAG_WRITE;
-       bool sync;
-       unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
-       struct inode *inode = mapping->host;
-       vm_fault_t result = VM_FAULT_FALLBACK;
-       struct iomap iomap = { .type = IOMAP_HOLE };
-       struct iomap srcmap = { .type = IOMAP_HOLE };
+       struct iomap_iter iter = {
+               .inode          = mapping->host,
+               .len            = PMD_SIZE,
+               .flags          = IOMAP_FAULT,
+       };
+       vm_fault_t ret = VM_FAULT_FALLBACK;
        pgoff_t max_pgoff;
        void *entry;
-       loff_t pos;
        int error;
-       pfn_t pfn;
+       if (vmf->flags & FAULT_FLAG_WRITE)
+               iter.flags |= IOMAP_WRITE;
  
        /*
         * Check whether offset isn't beyond end of file now. Caller is
         * supposed to hold locks serializing us with truncate / punch hole so
         * this is a reliable test.
         */
-       max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-       trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
-       /*
-        * Make sure that the faulting address's PMD offset (color) matches
-        * the PMD offset from the start of the file.  This is necessary so
-        * that a PMD range in the page table overlaps exactly with a PMD
-        * range in the page cache.
-        */
-       if ((vmf->pgoff & PG_PMD_COLOUR) !=
-           ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
-               goto fallback;
+       max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
  
-       /* Fall back to PTEs if we're going to COW */
-       if (write && !(vma->vm_flags & VM_SHARED))
-               goto fallback;
-       /* If the PMD would extend outside the VMA */
-       if (pmd_addr < vma->vm_start)
-               goto fallback;
-       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
-               goto fallback;
+       trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
  
        if (xas.xa_index >= max_pgoff) {
-               result = VM_FAULT_SIGBUS;
+               ret = VM_FAULT_SIGBUS;
                goto out;
        }
  
-       /* If the PMD would extend beyond the file size */
-       if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
+       if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
                goto fallback;
  
        /*
         */
        entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
        if (xa_is_internal(entry)) {
-               result = xa_to_internal(entry);
+               ret = xa_to_internal(entry);
                goto fallback;
        }
  
         */
        if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
                        !pmd_devmap(*vmf->pmd)) {
-               result = 0;
+               ret = 0;
                goto unlock_entry;
        }
  
-       /*
-        * Note that we don't use iomap_apply here.  We aren't doing I/O, only
-        * setting up a mapping, so really we're using iomap_begin() as a way
-        * to look up our filesystem block.
-        */
-       pos = (loff_t)xas.xa_index << PAGE_SHIFT;
-       error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
-                       &srcmap);
-       if (error)
-               goto unlock_entry;
-       if (iomap.offset + iomap.length < pos + PMD_SIZE)
-               goto finish_iomap;
-       sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
-       switch (iomap.type) {
-       case IOMAP_MAPPED:
-               error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
-               if (error < 0)
-                       goto finish_iomap;
-               entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
-                                               DAX_PMD, write && !sync);
-               /*
-                * If we are doing synchronous page fault and inode needs fsync,
-                * we can insert PMD into page tables only after that happens.
-                * Skip insertion for now and return the pfn so that caller can
-                * insert it after fsync is done.
-                */
-               if (sync) {
-                       if (WARN_ON_ONCE(!pfnp))
-                               goto finish_iomap;
-                       *pfnp = pfn;
-                       result = VM_FAULT_NEEDDSYNC;
-                       goto finish_iomap;
-               }
+       iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
+       while ((error = iomap_iter(&iter, ops)) > 0) {
+               if (iomap_length(&iter) < PMD_SIZE)
+                       continue; /* actually breaks out of the loop */
  
-               trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-               result = vmf_insert_pfn_pmd(vmf, pfn, write);
-               break;
-       case IOMAP_UNWRITTEN:
-       case IOMAP_HOLE:
-               if (WARN_ON_ONCE(write))
-                       break;
-               result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
-               break;
-       default:
-               WARN_ON_ONCE(1);
-               break;
+               ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
+               if (ret != VM_FAULT_FALLBACK)
+                       iter.processed = PMD_SIZE;
        }
  
-  finish_iomap:
-       if (ops->iomap_end) {
-               int copied = PMD_SIZE;
-               if (result == VM_FAULT_FALLBACK)
-                       copied = 0;
-               /*
-                * The fault is done by now and there's no way back (other
-                * thread may be already happily using PMD we have installed).
-                * Just ignore error from ->iomap_end since we cannot do much
-                * with it.
-                */
-               ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
-                               &iomap);
-       }
-  unlock_entry:
+ unlock_entry:
        dax_unlock_entry(&xas, entry);
-  fallback:
-       if (result == VM_FAULT_FALLBACK) {
-               split_huge_pmd(vma, vmf->pmd, vmf->address);
+ fallback:
+       if (ret == VM_FAULT_FALLBACK) {
+               split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
                count_vm_event(THP_FAULT_FALLBACK);
        }
  out:
-       trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
-       return result;
+       trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
+       return ret;
  }
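
       For readers new to the iterator, the converted handler above follows the
       loop shape this series introduces in place of iomap_apply().  A minimal
       kernel-context sketch of that shape, with a hypothetical consume_extent()
       helper standing in for the real per-extent fault logic:

       /*
        * Sketch (kernel context) of the iomap_iter consumer loop that
        * replaces iomap_apply throughout this series.  consume_extent()
        * is a hypothetical stand-in for the real per-extent work.
        */
       #include <linux/iomap.h>

       static loff_t consume_extent(const struct iomap_iter *iter)
       {
               /* Pretend the whole extent was handled. */
               return iomap_length(iter);
       }

       static int example_iomap_walk(struct inode *inode, loff_t pos, u64 len,
                                     const struct iomap_ops *ops)
       {
               struct iomap_iter iter = {
                       .inode  = inode,
                       .pos    = pos,
                       .len    = len,
               };
               int ret;

               while ((ret = iomap_iter(&iter, ops)) > 0) {
                       /*
                        * iter.iomap now describes one extent.  Record how
                        * many bytes were consumed; iomap_iter() calls
                        * ->iomap_end() and advances by that amount on the
                        * next pass.  Zero or a negative errno ends the walk.
                        */
                       iter.processed = consume_extent(&iter);
               }
               return ret;
       }

       This is why the PMD handler above only sets iter.processed = PMD_SIZE
       when the fault did not fall back: leaving processed at zero terminates
       the walk after the current extent.
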
  #else
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
diff --combined fs/internal.h
@@@ -48,8 -48,8 +48,8 @@@ static inline int emergency_thaw_bdev(s
  /*
   * buffer.c
   */
- extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block, struct iomap *iomap);
+ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, const struct iomap *iomap);
  
  /*
   * char_dev.c
@@@ -71,15 -71,11 +71,15 @@@ extern int filename_lookup(int dfd, str
                           struct path *path, struct path *root);
  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
                           const char *, unsigned int, struct path *);
 -long do_rmdir(int dfd, struct filename *name);
 -long do_unlinkat(int dfd, struct filename *name);
 +int do_rmdir(int dfd, struct filename *name);
 +int do_unlinkat(int dfd, struct filename *name);
  int may_linkat(struct user_namespace *mnt_userns, struct path *link);
  int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
                 struct filename *newname, unsigned int flags);
 +int do_mkdirat(int dfd, struct filename *name, umode_t mode);
 +int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
 +int do_linkat(int olddfd, struct filename *old, int newdfd,
 +                      struct filename *new, int flags);
  
  /*
   * namespace.c