Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

[linux-2.6-microblaze.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index bd5689f..2b7fe98 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
  #include <linux/sched/mm.h>
  #include <linux/iomap.h>
  #include <asm/unaligned.h>
+#include <linux/fsverity.h>
  #include "misc.h"
  #include "ctree.h"
  #include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                         cur_size = min_t(unsigned long, compressed_size,
                                        PAGE_SIZE);
  
-                       kaddr = kmap_atomic(cpage);
+                       kaddr = page_address(cpage);
                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
-                       kunmap_atomic(kaddr);
  
                         i++;
                         ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
   */
  static inline bool inode_can_compress(struct btrfs_inode *inode)
  {
+       /* Subpage doesn't support compression yet */
+       if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+               return false;
         if (inode->flags & BTRFS_INODE_NODATACOW ||
             inode->flags & BTRFS_INODE_NODATASUM)
                 return false;
@@ -682,7 +685,11 @@ again:
                 }
         }
  cont:
-       if (start == 0) {
+       /*
+        * Check cow_file_range() for why we don't even try to create an inline
+        * extent for the subpage case.
+        */
+       if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                 /* lets try to make an inline extent */
                 if (ret || total_in < actual_end) {
                         /* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:
  
                         p->mapping = inode->vfs_inode.i_mapping;
                         btrfs_writepage_endio_finish_ordered(inode, p, start,
-                                                            end, 0);
+                                                            end, false);
  
                         p->mapping = NULL;
                         extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
  
         inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
  
-       if (start == 0) {
+       /*
+        * Due to the page size limit, for subpage we can only trigger the
+        * writeback for the dirty sectors of a page, which means data
+        * writeback is doing more writeback than what we want.
+        *
+        * This is especially unexpected for some call sites like fallocate,
+        * where we only increase i_size after everything is done.
+        * This means we can create an inline extent even if we didn't want to.
+        * So here we skip inline extent creation completely.
+        */
+       if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                 /* lets try to make an inline extent */
                 ret = cow_file_range_inline(inode, start, end, 0,
                                             BTRFS_COMPRESS_NONE, NULL);
@@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                 PAGE_SHIFT;
  
-       /* atomic_sub_return implies a barrier */
-       if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
-           5 * SZ_1M)
-               cond_wake_up_nomb(&fs_info->async_submit_wait);
-
         /*
          * ->inode could be NULL if async_chunk_start has failed to compress,
          * in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
          */
         if (async_chunk->inode)
                 submit_compressed_extents(async_chunk);
+
+       /* atomic_sub_return implies a barrier */
+       if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+           5 * SZ_1M)
+               cond_wake_up_nomb(&fs_info->async_submit_wait);
  }
  
  static noinline void async_cow_free(struct btrfs_work *work)
@@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
                 ret = cow_file_range_async(inode, wbc, locked_page, start, end,
                                            page_started, nr_written);
         }
+       ASSERT(ret <= 0);
         if (ret)
                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
                                               end - start + 1);
@@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
         struct extent_map *split_mid = NULL;
         struct extent_map *split_post = NULL;
         int ret = 0;
-       int modified;
         unsigned long flags;
  
         /* Sanity check */
@@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
         ASSERT(em->len == len);
         ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
         ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+       ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+       ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+       ASSERT(!list_empty(&em->list));
  
         flags = em->flags;
         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-       clear_bit(EXTENT_FLAG_LOGGING, &flags);
-       modified = !list_empty(&em->list);
  
         /* First, replace the em with a new extent_map starting from * em->start */
         split_pre->start = em->start;
@@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
         split_pre->compress_type = em->compress_type;
         split_pre->generation = em->generation;
  
-       replace_extent_mapping(em_tree, em, split_pre, modified);
+       replace_extent_mapping(em_tree, em, split_pre, 1);
  
         /*
          * Now we only have an extent_map at:
@@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
                 split_mid->flags = flags;
                 split_mid->compress_type = em->compress_type;
                 split_mid->generation = em->generation;
-               add_extent_mapping(em_tree, split_mid, modified);
+               add_extent_mapping(em_tree, split_mid, 1);
         }
  
         if (post) {
@@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
                 split_post->flags = flags;
                 split_post->compress_type = em->compress_type;
                 split_post->generation = em->generation;
-               add_extent_mapping(em_tree, split_post, modified);
+               add_extent_mapping(em_tree, split_post, 1);
         }
  
         /* Once for us */
@@ -2770,7 +2788,7 @@ out_page:
   * to fix it up.  The async helper will wait for ordered extents, set
   * the delalloc bit and make it safe to write the page.
   */
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
  {
         struct inode *inode = page->mapping->host;
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
  
  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                           struct page *page, u64 start,
-                                         u64 end, int uptodate)
+                                         u64 end, bool uptodate)
  {
         trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
  
@@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
                 return 0;
         }
  
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+       /*
+        * For the subpage case, the PageChecked check above is not safe, as it
+        * is not subpage compatible.
+        * But for now only cow fixup and compressed read utilize the
+        * PageChecked flag, while in this context we can easily use
+        * io_bio->csum to determine if we really need to do csum verification.
+        *
+        * So for now, just exit if io_bio->csum is NULL, as it means this is a
+        * compressed read, and its compressed data csum has already been
+        * verified.
+        */
+       if (io_bio->csum == NULL)
                 return 0;
  
-       if (!root->fs_info->csum_root)
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                 return 0;
  
-       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+       if (!root->fs_info->csum_root)
                 return 0;
-       }
  
         ASSERT(page_offset(page) <= start &&
                end <= page_offset(page) + PAGE_SIZE - 1);
         for (pg_off = offset_in_page(start);
              pg_off < offset_in_page(end);
              pg_off += sectorsize, bio_offset += sectorsize) {
+               u64 file_offset = pg_off + page_offset(page);
                 int ret;
  
+               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+                   test_range_bit(io_tree, file_offset,
+                                  file_offset + sectorsize - 1,
+                                  EXTENT_NODATASUM, 1, NULL)) {
+                       /* Skip the range without csum for data reloc inode */
+                       clear_extent_bits(io_tree, file_offset,
+                                         file_offset + sectorsize - 1,
+                                         EXTENT_NODATASUM);
+                       continue;
+               }
                 ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
                                       page_offset(page) + pg_off);
                 if (ret < 0) {
@@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
  
                 /*
                  * If we have an inode with links, there are a couple of
-                * possibilities. Old kernels (before v3.12) used to create an
+                * possibilities:
+                *
+                * 1. We were halfway through creating fsverity metadata for the
+                * file. In that case, the orphan item represents incomplete
+                * fsverity metadata which must be cleaned up by calling
+                * btrfs_drop_verity_items() and deleting the orphan item.
+                *
+                * 2. Old kernels (before v3.12) used to create an
                  * orphan item for truncate indicating that there were possibly
                  * extent items past i_size that needed to be deleted. In v3.12,
                  * truncate was changed to update i_size in sync with the extent
@@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                  * but either way, we can delete the orphan item.
                  */
                 if (ret == -ENOENT || inode->i_nlink) {
-                       if (!ret)
+                       if (!ret) {
+                               ret = btrfs_drop_verity_items(BTRFS_I(inode));
                                 iput(inode);
+                               if (ret)
+                                       goto out;
+                       }
                         trans = btrfs_start_transaction(root, 1);
                         if (IS_ERR(trans)) {
                                 ret = PTR_ERR(trans);
@@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
         rdev = btrfs_inode_rdev(leaf, inode_item);
  
         BTRFS_I(inode)->index_cnt = (u64)-1;
-       BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+       btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+                               &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
  
  cache_index:
         /*
@@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                             struct inode *inode)
  {
         struct btrfs_map_token token;
+       u64 flags;
  
         btrfs_init_map_token(&token, leaf);
  
@@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
         btrfs_set_token_inode_transid(&token, item, trans->transid);
         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-       btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+       flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+                                         BTRFS_I(inode)->ro_flags);
+       btrfs_set_token_inode_flags(&token, item, flags);
         btrfs_set_token_inode_block_group(&token, item, 0);
  }
  
@@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
         int ret;
  
         /*
-        * Still need to make sure the inode looks like it's been updated so
-        * that any holes get logged if we fsync.
+        * If NO_HOLES is enabled, we don't need to do anything.
+        * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+        * or btrfs_update_inode() will be called, which guarantee that the next
+        * fsync will know this inode was changed and needs to be logged.
          */
-       if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
-               inode->last_trans = fs_info->generation;
-               inode->last_sub_trans = root->log_transid;
-               inode->last_log_commit = root->last_log_commit;
+       if (btrfs_fs_incompat(fs_info, NO_HOLES))
                 return 0;
-       }
  
         /*
          * 1 - for the one we're dropping
@@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
         if (btrfs_root_readonly(root))
                 return -EROFS;
  
-       err = setattr_prepare(&init_user_ns, dentry, attr);
+       err = setattr_prepare(mnt_userns, dentry, attr);
         if (err)
                 return err;
  
@@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
         }
  
         if (attr->ia_valid) {
-               setattr_copy(&init_user_ns, inode, attr);
+               setattr_copy(mnt_userns, inode, attr);
                 inode_inc_iversion(inode);
                 err = btrfs_dirty_inode(inode);
  
                 if (!err && attr->ia_valid & ATTR_MODE)
-                       err = posix_acl_chmod(&init_user_ns, inode,
-                                             inode->i_mode);
+                       err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
         }
  
         return err;
@@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
         trace_btrfs_inode_evict(inode);
  
         if (!root) {
+               fsverity_cleanup_inode(inode);
                 clear_inode(inode);
                 return;
         }
@@ -5604,6 +5654,7 @@ no_delete:
          * to retry these periodically in the future.
          */
         btrfs_remove_delayed_node(BTRFS_I(inode));
+       fsverity_cleanup_inode(inode);
         clear_inode(inode);
  }
  
@@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
  
  static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
+                                    struct user_namespace *mnt_userns,
                                      struct inode *dir,
                                      const char *name, int name_len,
                                      u64 ref_objectid, u64 objectid,
@@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         if (ret != 0)
                 goto fail_unlock;
  
-       inode_init_owner(&init_user_ns, inode, dir, mode);
+       inode_init_owner(mnt_userns, inode, dir, mode);
         inode_set_bytes(inode, 0);
  
         inode->i_mtime = current_time(inode);
@@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
         if (err)
                 goto out_unlock;
  
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-                       mode, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
         if (IS_ERR(inode)) {
                 err = PTR_ERR(inode);
                 inode = NULL;
@@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
         if (err)
                 goto out_unlock;
  
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-                       mode, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
         if (IS_ERR(inode)) {
                 err = PTR_ERR(inode);
                 inode = NULL;
@@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
         if (err)
                 goto out_fail;
  
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                       dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                       dentry->d_name.name, dentry->d_name.len,
+                       btrfs_ino(BTRFS_I(dir)), objectid,
                         S_IFDIR | mode, &index);
         if (IS_ERR(inode)) {
                 err = PTR_ERR(inode);
@@ -8206,8 +8259,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
         u64 start_sector;
         int async_submit = 0;
         u64 submit_len;
-       int clone_offset = 0;
-       int clone_len;
+       u64 clone_offset = 0;
+       u64 clone_len;
         u64 logical;
         int ret;
         blk_status_t status;
@@ -8255,9 +8308,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
                         status = errno_to_blk_status(ret);
                         goto out_err_em;
                 }
-               ASSERT(geom.len <= INT_MAX);
  
-               clone_len = min_t(int, submit_len, geom.len);
+               clone_len = min(submit_len, geom.len);
+               ASSERT(clone_len <= UINT_MAX);
  
                 /*
                  * This will never fail as it's passing GPF_NOFS and
@@ -8401,11 +8454,47 @@ static void btrfs_readahead(struct readahead_control *rac)
         extent_readahead(rac);
  }
  
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free of the subpage spinlock.  So this function spins and waits
+ * for the subpage spinlock to be released.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+       struct btrfs_subpage *subpage;
+
+       if (fs_info->sectorsize == PAGE_SIZE)
+               return;
+
+       ASSERT(PagePrivate(page) && page->private);
+       subpage = (struct btrfs_subpage *)page->private;
+
+       /*
+        * This may look insane as we just acquire the spinlock and release it,
+        * without doing anything.  But we just want to make sure no one is
+        * still holding the subpage spinlock.
+        * And since the page is neither dirty nor under writeback, and we have
+        * the page locked, the only possible holder of the spinlock is the
+        * endio function that clears page writeback.
+        *
+        * Here we just acquire the spinlock so that all existing callers
+        * should exit and we're safe to release/invalidate the page.
+        */
+       spin_lock_irq(&subpage->lock);
+       spin_unlock_irq(&subpage->lock);
+}
+
  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
  {
         int ret = try_release_extent_mapping(page, gfp_flags);
-       if (ret == 1)
+
+       if (ret == 1) {
+               wait_subpage_spinlock(page);
                 clear_page_extent_mapped(page);
+       }
         return ret;
  }
  
@@ -8469,6 +8558,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
          * do double ordered extent accounting on the same page.
          */
         wait_on_page_writeback(page);
+       wait_subpage_spinlock(page);
  
         /*
          * For subpage case, we have call sites like
@@ -8557,7 +8647,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                 spin_unlock_irq(&inode->ordered_tree.lock);
  
                 if (btrfs_dec_test_ordered_pending(inode, &ordered,
-                                       cur, range_end + 1 - cur, 1)) {
+                                                  cur, range_end + 1 - cur)) {
                         btrfs_finish_ordered_io(ordered);
                         /*
                          * The ordered extent has finished, now we're again
@@ -8938,7 +9028,8 @@ out:
   */
  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *new_root,
-                            struct btrfs_root *parent_root)
+                            struct btrfs_root *parent_root,
+                            struct user_namespace *mnt_userns)
  {
         struct inode *inode;
         int err;
@@ -8949,7 +9040,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
         if (err < 0)
                 return err;
  
-       inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+       inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+                               ino, ino,
                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
                                 &index);
         if (IS_ERR(inode))
@@ -8993,6 +9085,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->defrag_bytes = 0;
         ei->disk_i_size = 0;
         ei->flags = 0;
+       ei->ro_flags = 0;
         ei->csum_bytes = 0;
         ei->index_cnt = (u64)-1;
         ei->dir_index = 0;
@@ -9174,6 +9267,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
         struct inode *inode = d_inode(path->dentry);
         u32 blocksize = inode->i_sb->s_blocksize;
         u32 bi_flags = BTRFS_I(inode)->flags;
+       u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
  
         stat->result_mask |= STATX_BTIME;
         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9186,13 +9280,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
                 stat->attributes |= STATX_ATTR_IMMUTABLE;
         if (bi_flags & BTRFS_INODE_NODUMP)
                 stat->attributes |= STATX_ATTR_NODUMP;
+       if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+               stat->attributes |= STATX_ATTR_VERITY;
  
         stat->attributes_mask |= (STATX_ATTR_APPEND |
                                   STATX_ATTR_COMPRESSED |
                                   STATX_ATTR_IMMUTABLE |
                                   STATX_ATTR_NODUMP);
  
-       generic_fillattr(&init_user_ns, inode, stat);
+       generic_fillattr(mnt_userns, inode, stat);
         stat->dev = BTRFS_I(inode)->root->anon_dev;
  
         spin_lock(&BTRFS_I(inode)->lock);
@@ -9280,8 +9376,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(trans);
         } else {
-               btrfs_pin_log_trans(root);
-               root_log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9298,8 +9392,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(trans);
         } else {
-               btrfs_pin_log_trans(dest);
-               dest_log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, root,
                                              old_dentry->d_name.name,
                                              old_dentry->d_name.len,
@@ -9330,6 +9422,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                                 BTRFS_I(new_inode), 1);
         }
  
+       /*
+        * Now pin the logs of the roots. We do it to ensure that no other task
+        * can sync the logs while we are in progress with the rename, because
+        * that could result in an inconsistency in case any of the inodes that
+        * are part of this rename operation were logged before.
+        *
+        * We pin the logs even if at this precise moment none of the inodes was
+        * logged before. This is because right after we checked for that, some
+        * other task fsyncing some other inode not involved with this rename
+        * operation could log that one of our inodes exists.
+        *
+        * We don't need to pin the logs before the above calls to
+        * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+        */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+       }
+       if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+       }
+
         /* src is a subvolume */
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9411,8 +9526,7 @@ out_fail:
                 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
                     btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
                     btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
-                   (new_inode &&
-                    btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+                   btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
                         btrfs_set_log_full_commit(trans);
  
                 if (root_log_pinned) {
@@ -9436,6 +9550,7 @@ out_notrans:
  
  static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
+                                    struct user_namespace *mnt_userns,
                                      struct inode *dir,
                                      struct dentry *dentry)
  {
@@ -9448,7 +9563,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
         if (ret)
                 return ret;
  
-       inode = btrfs_new_inode(trans, root, dir,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
                                 dentry->d_name.name,
                                 dentry->d_name.len,
                                 btrfs_ino(BTRFS_I(dir)),
@@ -9485,9 +9600,10 @@ out:
         return ret;
  }
  
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry,
-                          unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+                       struct inode *old_dir, struct dentry *old_dentry,
+                       struct inode *new_dir, struct dentry *new_dentry,
+                       unsigned int flags)
  {
         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
         struct btrfs_trans_handle *trans;
@@ -9582,8 +9698,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(trans);
         } else {
-               btrfs_pin_log_trans(root);
-               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9607,6 +9721,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
         } else {
+               /*
+                * Now pin the log. We do it to ensure that no other task can
+                * sync the log while we are in progress with the rename, as
+                * that could result in an inconsistency in case any of the
+                * inodes that are part of this rename operation were logged
+                * before.
+                *
+                * We pin the log even if at this precise moment none of the
+                * inodes was logged before. This is because right after we
+                * checked for that, some other task fsyncing some other inode
+                * not involved with this rename operation could log that one of
+                * our inodes exists.
+                *
+                * We don't need to pin the logs before the above call to
+                * btrfs_insert_inode_ref(), since that does not need to change
+                * a log.
+                */
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
                                         BTRFS_I(d_inode(old_dentry)),
                                         old_dentry->d_name.name,
@@ -9660,8 +9793,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         }
  
         if (flags & RENAME_WHITEOUT) {
-               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
-                                               old_dentry);
+               ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+                                               old_dir, old_dentry);
  
                 if (ret) {
                         btrfs_abort_transaction(trans, ret);
@@ -9711,7 +9844,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
                 return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
                                           new_dentry);
  
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+       return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+                           new_dentry, flags);
  }
  
  struct btrfs_delalloc_work {
@@ -9808,11 +9942,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
                         btrfs_queue_work(root->fs_info->flush_workers,
                                          &work->work);
                 } else {
-                       ret = sync_inode(inode, wbc);
-                       if (!ret &&
-                           test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                                    &BTRFS_I(inode)->runtime_flags))
-                               ret = sync_inode(inode, wbc);
+                       ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
                         btrfs_add_delayed_iput(inode);
                         if (ret || wbc->nr_to_write <= 0)
                                 goto out;
@@ -9947,9 +10077,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
         if (err)
                 goto out_unlock;
  
-       inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                               dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
-                               objectid, S_IFLNK|S_IRWXUGO, &index);
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+                               dentry->d_name.name, dentry->d_name.len,
+                               btrfs_ino(BTRFS_I(dir)), objectid,
+                               S_IFLNK | S_IRWXUGO, &index);
         if (IS_ERR(inode)) {
                 err = PTR_ERR(inode);
                 inode = NULL;
@@ -10273,7 +10404,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
                         return -EACCES;
         }
-       return generic_permission(&init_user_ns, inode, mask);
+       return generic_permission(mnt_userns, inode, mask);
  }
  
  static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
@@ -10298,7 +10429,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
         if (ret)
                 goto out;
  
-       inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+       inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
         if (IS_ERR(inode)) {
                 ret = PTR_ERR(inode);