Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 18 Dec 2016 02:44:00 +0000 (18:44 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 18 Dec 2016 02:44:00 +0000 (18:44 -0800)
Pull more vfs updates from Al Viro:
 "In this pile:

   - autofs-namespace series
   - dedupe stuff
   - more struct path constification"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (40 commits)
  ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
  ocfs2: charge quota for reflinked blocks
  ocfs2: fix bad pointer cast
  ocfs2: always unlock when completing dio writes
  ocfs2: don't eat io errors during _dio_end_io_write
  ocfs2: budget for extent tree splits when adding refcount flag
  ocfs2: prohibit refcounted swapfiles
  ocfs2: add newlines to some error messages
  ocfs2: convert inode refcount test to a helper
  simple_write_end(): don't zero in short copy into uptodate
  exofs: don't mess with simple_write_{begin,end}
  9p: saner ->write_end() on failing copy into non-uptodate page
  fix gfs2_stuffed_write_end() on short copies
  fix ceph_write_end()
  nfs_write_end(): fix handling of short copies
  vfs: refactor clone/dedupe_file_range common functions
  fs: try to clone files first in vfs_copy_file_range
  vfs: misc struct path constification
  namespace.c: constify struct path passed to a bunch of primitives
  quota: constify struct path in quota_on
  ...

20 files changed:
Documentation/filesystems/Locking
fs/9p/vfs_addr.c
fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/ceph/addr.c
fs/ext4/super.c
fs/internal.h
fs/namei.c
fs/nfs/file.c
fs/ocfs2/aops.c
fs/ocfs2/refcounttree.c
fs/read_write.c
fs/xfs/xfs_file.c
fs/xfs/xfs_reflink.c
include/linux/fs.h
kernel/audit.c
kernel/audit_fsnotify.c
kernel/audit_tree.c
kernel/audit_watch.c

@@@ -20,7 -20,7 +20,7 @@@ prototypes
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
        struct vfsmount *(*d_automount)(struct path *path);
-       int (*d_manage)(struct dentry *, bool);
+       int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *,
                                 unsigned int);
  
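
The d_manage() conversion above touches every in-tree implementer. A minimal
sketch against the new prototype (the "examplefs" names are hypothetical, not
part of this series):

    #include <linux/dcache.h>
    #include <linux/path.h>

    static int examplefs_d_manage(const struct path *path, bool rcu_walk)
    {
            /* we cannot sleep under RCU-walk; ask the VFS to retry in ref-walk */
            if (rcu_walk)
                    return -ECHILD;

            /* returning 0 lets the VFS proceed through this dentry */
            return 0;
    }

    static const struct dentry_operations examplefs_dentry_ops = {
            .d_manage       = examplefs_d_manage,
    };
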
@@@ -556,7 -556,7 +556,7 @@@ till "end_pgoff". ->map_pages() is call
  not block.  If it's not possible to reach a page without blocking, the
  filesystem should skip it. The filesystem should use do_set_pte() to set
  up the page table entry. A pointer to the entry associated with the page
 -is passed in the "pte" field in the fault_env structure. Pointers to
 +is passed in the "pte" field in the vm_fault structure. Pointers to
  entries for other offsets should be calculated relative to "pte".
  
        ->page_mkwrite() is called when a previously read-only pte is
diff --combined fs/9p/vfs_addr.c
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/idr.h>
  #include <linux/sched.h>
  #include <linux/uio.h>
 +#include <linux/bvec.h>
  #include <net/9p/9p.h>
  #include <net/9p/client.h>
  
@@@ -310,18 -309,10 +310,10 @@@ static int v9fs_write_end(struct file *
  
        p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
  
-       if (unlikely(copied < len)) {
-               /*
-                * zero out the rest of the area
-                */
-               unsigned from = pos & (PAGE_SIZE - 1);
-               zero_user(page, from + copied, len - copied);
-               flush_dcache_page(page);
+       if (unlikely(copied < len && !PageUptodate(page))) {
+               copied = 0;
+               goto out;
        }
-       if (!PageUptodate(page))
-               SetPageUptodate(page);
        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
        if (last_pos > inode->i_size) {
                i_size_write(inode, last_pos);
        }
        set_page_dirty(page);
+ out:
        unlock_page(page);
        put_page(page);
  
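
The ->write_end() fixes in this pull (9p, gfs2, ceph, nfs, simple_write_end)
share one rule: a short copy into a page that was never brought uptodate must
be reported as zero bytes written, so that generic_perform_write() faults the
source buffer in and retries instead of exposing stale page contents. A
hedged sketch of the common shape ("examplefs" is hypothetical; each
filesystem adds its own bookkeeping):

    #include <linux/fs.h>
    #include <linux/pagemap.h>

    static int examplefs_write_end(struct file *file, struct address_space *mapping,
                                   loff_t pos, unsigned len, unsigned copied,
                                   struct page *page, void *fsdata)
    {
            struct inode *inode = page->mapping->host;
            loff_t last_pos = pos + copied;

            /* short copy into a !Uptodate page: claim nothing was written */
            if (unlikely(copied < len && !PageUptodate(page))) {
                    copied = 0;
                    goto out;
            }
            if (last_pos > inode->i_size)
                    i_size_write(inode, last_pos);
            set_page_dirty(page);
    out:
            unlock_page(page);
            put_page(page);
            return copied;
    }
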
diff --combined fs/btrfs/ctree.h
@@@ -90,6 -90,9 +90,6 @@@ static const int btrfs_csum_sizes[] = 
  /* four bytes for CRC32 */
  #define BTRFS_EMPTY_DIR_SIZE 0
  
 -/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
 -#define REQ_GET_READ_MIRRORS  (1 << 30)
 -
  /* ioprio of readahead is set to idle */
  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
  
@@@ -337,7 -340,7 +337,7 @@@ struct btrfs_path 
        unsigned int need_commit_sem:1;
        unsigned int skip_release_on_error:1;
  };
 -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
 +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
                                        sizeof(struct btrfs_item))
  struct btrfs_dev_replace {
        u64 replace_state;      /* see #define above */
@@@ -426,10 -429,6 +426,10 @@@ struct btrfs_space_info 
        struct list_head ro_bgs;
        struct list_head priority_tickets;
        struct list_head tickets;
 +      /*
 +       * tickets_id just indicates the next ticket will be handled, so note
 +       * it's not stored per ticket.
 +       */
        u64 tickets_id;
  
        struct rw_semaphore groups_sem;
@@@ -519,7 -518,7 +519,7 @@@ struct btrfs_io_ctl 
        void *cur, *orig;
        struct page *page;
        struct page **pages;
 -      struct btrfs_root *root;
 +      struct btrfs_fs_info *fs_info;
        struct inode *inode;
        unsigned long size;
        int index;
@@@ -799,6 -798,7 +799,6 @@@ struct btrfs_fs_info 
        spinlock_t super_lock;
        struct btrfs_super_block *super_copy;
        struct btrfs_super_block *super_for_commit;
 -      struct block_device *__bdev;
        struct super_block *sb;
        struct inode *btree_inode;
        struct backing_dev_info bdi;
  
        /* Used to record internally whether fs has been frozen */
        int fs_frozen;
 +
 +      /* Cached block sizes */
 +      u32 nodesize;
 +      u32 sectorsize;
 +      u32 stripesize;
  };
  
 +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 +{
 +      return sb->s_fs_info;
 +}
 +
  struct btrfs_subvolume_writers {
        struct percpu_counter   counter;
        wait_queue_head_t       wait;
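
With btrfs_sb() hoisted this early into ctree.h and the block sizes cached in
btrfs_fs_info, call sites can go from an inode to the filesystem geometry
without detouring through a root. An illustrative helper (hypothetical,
composing the pieces introduced above; assumes ctree.h):

    static u64 example_round_to_sector(const struct inode *inode, u64 bytes)
    {
            struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

            /* sectorsize is now cached once per filesystem, not per root */
            return round_up(bytes, (u64)fs_info->sectorsize);
    }
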
@@@ -1169,6 -1159,14 +1169,6 @@@ struct btrfs_root 
        u64 objectid;
        u64 last_trans;
  
 -      /* data allocations are done in sectorsize units */
 -      u32 sectorsize;
 -
 -      /* node allocations are done in nodesize units */
 -      u32 nodesize;
 -
 -      u32 stripesize;
 -
        u32 type;
  
        u64 highest_objectid;
        /* For qgroup metadata space reserve */
        atomic_t qgroup_meta_rsv;
  };
 +static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
 +{
 +      return btrfs_sb(inode->i_sb)->sectorsize;
 +}
  
  static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
  {
        return blocksize - sizeof(struct btrfs_header);
  }
  
 -static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
  {
 -      return __BTRFS_LEAF_DATA_SIZE(root->nodesize);
 +      return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
  }
  
 -static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
 +      return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
  }
  
 -static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
 +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr);
 +      return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
  }
  
  #define BTRFS_FILE_EXTENT_INLINE_DATA_START           \
                (offsetof(struct btrfs_file_extent_item, disk_bytenr))
 -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_MAX_ITEM_SIZE(root) -
 +      return BTRFS_MAX_ITEM_SIZE(info) -
               BTRFS_FILE_EXTENT_INLINE_DATA_START;
  }
  
 -static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item);
 +      return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
  }
  
  /*
  
  #ifdef CONFIG_BTRFS_DEBUG
  static inline int
 -btrfs_should_fragment_free_space(struct btrfs_root *root,
 -                               struct btrfs_block_group_cache *block_group)
 +btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group)
  {
 -      return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
 +      struct btrfs_fs_info *fs_info = block_group->fs_info;
 +
 +      return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
                block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 -             (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
 +             (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
                block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
  }
  #endif
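
All of the geometry helpers above now key off btrfs_fs_info. A caller that
still holds only a root derives it once (sketch; hypothetical caller, assumes
ctree.h):

    static u32 example_node_fanout(struct btrfs_root *root)
    {
            struct btrfs_fs_info *fs_info = root->fs_info;

            /* key pointers that fit in one node, from the cached nodesize */
            return BTRFS_NODEPTRS_PER_BLOCK(fs_info);
    }
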
@@@ -2217,8 -2210,6 +2217,8 @@@ btrfs_disk_balance_args_to_cpu(struct b
        cpu->target = le64_to_cpu(disk->target);
        cpu->flags = le64_to_cpu(disk->flags);
        cpu->limit = le64_to_cpu(disk->limit);
 +      cpu->stripes_min = le32_to_cpu(disk->stripes_min);
 +      cpu->stripes_max = le32_to_cpu(disk->stripes_max);
  }
  
  static inline void
@@@ -2237,8 -2228,6 +2237,8 @@@ btrfs_cpu_balance_args_to_disk(struct b
        disk->target = cpu_to_le64(cpu->target);
        disk->flags = cpu_to_le64(cpu->flags);
        disk->limit = cpu_to_le64(cpu->limit);
 +      disk->stripes_min = cpu_to_le32(cpu->stripes_min);
 +      disk->stripes_max = cpu_to_le32(cpu->stripes_max);
  }
  
  /* struct btrfs_super_block */
@@@ -2310,13 -2299,13 +2310,13 @@@ static inline unsigned long btrfs_leaf_
   * this returns the address of the start of the last item,
   * which is the stop of the leaf data stack
   */
 -static inline unsigned int leaf_data_end(struct btrfs_root *root,
 +static inline unsigned int leaf_data_end(struct btrfs_fs_info *fs_info,
                                         struct extent_buffer *leaf)
  {
        u32 nr = btrfs_header_nritems(leaf);
  
        if (nr == 0)
 -              return BTRFS_LEAF_DATA_SIZE(root);
 +              return BTRFS_LEAF_DATA_SIZE(fs_info);
        return btrfs_item_offset_nr(leaf, nr - 1);
  }
  
@@@ -2512,6 -2501,11 +2512,6 @@@ BTRFS_SETGET_STACK_FUNCS(stack_dev_repl
  BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
                         struct btrfs_dev_replace_item, cursor_right, 64);
  
 -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 -{
 -      return sb->s_fs_info;
 -}
 -
  /* helper function to cast into the data area of the leaf. */
  #define btrfs_item_ptr(leaf, slot, type) \
        ((type *)(btrfs_leaf_data(leaf) + \
@@@ -2534,28 -2528,28 +2534,28 @@@ static inline gfp_t btrfs_alloc_write_m
  
  /* extent-tree.c */
  
 -u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
 +u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
  
 -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
                                                 unsigned num_items)
  {
 -      return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
 +      return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
  }
  
  /*
   * Doing a truncate won't result in new nodes or leaves, just what we need for
   * COW.
   */
 -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
                                                 unsigned num_items)
  {
 -      return root->nodesize * BTRFS_MAX_LEVEL * num_items;
 +      return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
  }
  
  int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                         const u64 start);
  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
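
The two reservation calculators keep the same worst-case math, only sourced
from fs_info->nodesize. For example, reserving for a two-item insert (sketch;
hypothetical caller):

    static u64 example_insert_reservation(struct btrfs_fs_info *fs_info)
    {
            /* two items, each possibly CoWing BTRFS_MAX_LEVEL nodes, twice over */
            return btrfs_calc_trans_metadata_size(fs_info, 2);
    }
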
@@@ -2564,18 -2558,18 +2564,18 @@@ void btrfs_dec_nocow_writers(struct btr
  void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root, unsigned long count);
 -int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 +                         struct btrfs_fs_info *fs_info, unsigned long count);
 +int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
                                 unsigned long count, u64 transid, int wait);
 -int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 -                           struct btrfs_root *root, u64 bytenr,
 +                           struct btrfs_fs_info *fs_info, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags);
 -int btrfs_pin_extent(struct btrfs_root *root,
 +int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
                     u64 bytenr, u64 num, int reserved);
 -int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 +int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
                                    u64 bytenr, u64 num_bytes);
 -int btrfs_exclude_logged_extents(struct btrfs_root *root,
 +int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
                                 struct extent_buffer *eb);
  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
@@@ -2596,11 -2590,12 +2596,11 @@@ void btrfs_free_tree_block(struct btrfs
                           struct extent_buffer *buf,
                           u64 parent, int last_ref);
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 -                                   struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins);
  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 -                                 struct btrfs_root *root,
 +                                 struct btrfs_fs_info *fs_info,
                                   u64 root_objectid, u64 owner, u64 offset,
                                   struct btrfs_key *ins);
  int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
@@@ -2611,52 -2606,52 +2611,52 @@@ int btrfs_inc_ref(struct btrfs_trans_ha
  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref);
  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 -                              struct btrfs_root *root,
 +                              struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int level, int is_data);
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
 -                    struct btrfs_root *root,
 +                    struct btrfs_fs_info *fs_info,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
                      u64 owner, u64 offset);
  
 -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
 -                             int delalloc);
 -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 +                             u64 start, u64 len, int delalloc);
 +int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
                                       u64 start, u64 len);
  void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 -                               struct btrfs_root *root);
 +                               struct btrfs_fs_info *fs_info);
  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 -                             struct btrfs_root *root);
 +                             struct btrfs_fs_info *fs_info);
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 -                       struct btrfs_root *root,
 +                       struct btrfs_fs_info *fs_info,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset);
  
  int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 -                                 struct btrfs_root *root);
 +                                 struct btrfs_fs_info *fs_info);
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 -                                  struct btrfs_root *root);
 +                                 struct btrfs_fs_info *fs_info);
  int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 -                          struct btrfs_root *root);
 -int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 +                          struct btrfs_fs_info *fs_info);
 +int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_free_block_groups(struct btrfs_fs_info *info);
 -int btrfs_read_block_groups(struct btrfs_root *root);
 -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
 +int btrfs_read_block_groups(struct btrfs_fs_info *info);
 +int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root, u64 bytes_used,
 +                         struct btrfs_fs_info *fs_info, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
                           u64 size);
  struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                                struct btrfs_fs_info *fs_info,
                                const u64 chunk_offset);
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 -                           struct btrfs_root *root, u64 group_start,
 +                           struct btrfs_fs_info *fs_info, u64 group_start,
                             struct extent_map *em);
  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
  void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
  void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
  void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
  
@@@ -2686,7 -2681,7 +2686,7 @@@ void btrfs_free_reserved_data_space(str
  void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
                                            u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 -                              struct btrfs_root *root);
 +                                struct btrfs_fs_info *fs_info);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
  int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct inode *inode);
@@@ -2695,7 -2690,7 +2695,7 @@@ int btrfs_subvolume_reserve_metadata(st
                                     struct btrfs_block_rsv *rsv,
                                     int nitems,
                                     u64 *qgroup_reserved, bool use_global_rsv);
 -void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_rsv *rsv,
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
@@@ -2703,15 -2698,16 +2703,15 @@@ void btrfs_delalloc_release_metadata(st
  int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
  void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
                                              unsigned short type);
 -void btrfs_free_block_rsv(struct btrfs_root *root,
 +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
                          struct btrfs_block_rsv *rsv);
  void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
  int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush);
 -int btrfs_block_rsv_check(struct btrfs_root *root,
 -                        struct btrfs_block_rsv *block_rsv, int min_factor);
 +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
  int btrfs_block_rsv_refill(struct btrfs_root *root,
                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
                           enum btrfs_reserve_flush_enum flush);
@@@ -2721,21 -2717,22 +2721,21 @@@ int btrfs_block_rsv_migrate(struct btrf
  int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *dest, u64 num_bytes,
                             int min_factor);
 -void btrfs_block_rsv_release(struct btrfs_root *root,
 +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
  int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 -void btrfs_dec_block_group_ro(struct btrfs_root *root,
 -                            struct btrfs_block_group_cache *cache);
 +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
  u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 -int btrfs_error_unpin_extent_range(struct btrfs_root *root,
 +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
                                   u64 start, u64 end);
 -int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 num_bytes, u64 *actual_bytes);
  int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 -                          struct btrfs_root *root, u64 type);
 -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 +                          struct btrfs_fs_info *fs_info, u64 type);
 +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
  
  int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
@@@ -2745,7 -2742,8 +2745,7 @@@ int btrfs_start_write_no_snapshoting(st
  void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
  void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
  void check_system_chunk(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *root,
 -                      const u64 type);
 +                      struct btrfs_fs_info *fs_info, const u64 type);
  u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                       struct btrfs_fs_info *info, u64 start, u64 end);
  
@@@ -2795,10 -2793,10 +2795,10 @@@ int btrfs_copy_root(struct btrfs_trans_
                      struct extent_buffer **cow_ret, u64 new_root_objectid);
  int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf);
 -void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 +void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                       u32 data_size);
 -void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 -                       u32 new_size, int from_end);
 +void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
 +                       struct btrfs_path *path, u32 new_size, int from_end);
  int btrfs_split_item(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_path *path,
@@@ -2874,8 -2872,7 +2874,8 @@@ static inline int btrfs_next_item(struc
  {
        return btrfs_next_old_item(root, p, 0);
  }
 -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 +int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
 +                        struct extent_buffer *leaf);
  int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
                                     struct btrfs_block_rsv *block_rsv,
                                     int update_ref, int for_reloc);
@@@ -2901,9 -2898,10 +2901,9 @@@ static inline int btrfs_fs_closing(stru
   * anything except sleeping. This function is used to check the status of
   * the fs.
   */
 -static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
 +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
  {
 -      return (root->fs_info->sb->s_flags & MS_RDONLY ||
 -              btrfs_fs_closing(root->fs_info));
 +      return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info);
  }
  
  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
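
The cleaner-thread predicate likewise loses its detour through root->fs_info.
A sketch of the loop shape that consumes it (example_do_one_cleanup() is
hypothetical):

    static bool example_do_one_cleanup(struct btrfs_fs_info *fs_info);

    static void example_cleaner_loop(struct btrfs_fs_info *fs_info)
    {
            while (!btrfs_need_cleaner_sleep(fs_info)) {
                    if (!example_do_one_cleanup(fs_info))
                            break;
                    cond_resched();
            }
    }
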
@@@ -2933,11 -2931,11 +2933,11 @@@ int btrfs_old_root_level(struct btrfs_r
  
  /* root-item.c */
  int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 -                     struct btrfs_root *tree_root,
 +                     struct btrfs_fs_info *fs_info,
                       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
                       const char *name, int name_len);
  int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 -                     struct btrfs_root *tree_root,
 +                     struct btrfs_fs_info *fs_info,
                       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
                       const char *name, int name_len);
  int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@@ -2952,7 -2950,7 +2952,7 @@@ int __must_check btrfs_update_root(stru
  int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
                    struct btrfs_path *path, struct btrfs_root_item *root_item,
                    struct btrfs_key *root_key);
 -int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
  void btrfs_set_root_node(struct btrfs_root_item *item,
                         struct extent_buffer *node);
  void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
@@@ -2961,10 -2959,10 +2961,10 @@@ void btrfs_update_root_times(struct btr
  
  /* uuid-tree.c */
  int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *uuid_root, u8 *uuid, u8 type,
 +                      struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
                        u64 subid);
  int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *uuid_root, u8 *uuid, u8 type,
 +                      struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
                        u64 subid);
  int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
                            int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@@ -3006,10 -3004,10 +3006,10 @@@ struct btrfs_dir_item *btrfs_lookup_xat
                                          struct btrfs_path *path, u64 dir,
                                          const char *name, u16 name_len,
                                          int mod);
 -int verify_dir_item(struct btrfs_root *root,
 +int verify_dir_item(struct btrfs_fs_info *fs_info,
                    struct extent_buffer *leaf,
                    struct btrfs_dir_item *dir_item);
 -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_path *path,
                                                 const char *name,
                                                 int name_len);
@@@ -3053,10 -3051,11 +3053,10 @@@ int btrfs_find_name_in_ext_backref(stru
  /* file-item.c */
  struct btrfs_dio_private;
  int btrfs_del_csums(struct btrfs_trans_handle *trans,
 -                  struct btrfs_root *root, u64 bytenr, u64 len);
 -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 -                        struct bio *bio, u32 *dst);
 -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
 -                            struct bio *bio, u64 logical_offset);
 +                  struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
 +int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
 +int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
 +                            u64 logical_offset);
  int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 objectid, u64 pos,
@@@ -3070,8 -3069,8 +3070,8 @@@ int btrfs_lookup_file_extent(struct btr
  int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums);
 -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 -                     struct bio *bio, u64 file_start, int contig);
 +int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 +                     u64 file_start, int contig);
  int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list, int search_commit);
  void btrfs_extent_item_to_extent_map(struct inode *inode,
@@@ -3174,7 -3173,7 +3174,7 @@@ void btrfs_orphan_commit_root(struct bt
  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
  void btrfs_invalidate_inodes(struct btrfs_root *root);
  void btrfs_add_delayed_iput(struct inode *inode);
 -void btrfs_run_delayed_iputs(struct btrfs_root *root);
 +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
  int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
@@@ -3228,13 -3227,11 +3228,10 @@@ int btrfs_drop_extents(struct btrfs_tra
  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end);
  int btrfs_release_file(struct inode *inode, struct file *file);
 -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 -                    struct page **pages, size_t num_pages,
 -                    loff_t pos, size_t write_bytes,
 +int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 +                    size_t num_pages, loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
- ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-                             struct file *file_out, loff_t pos_out,
-                             size_t len, unsigned int flags);
  int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
                           struct file *file_out, loff_t pos_out, u64 len);
  
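
Dropping the btrfs_copy_file_range() declaration goes with the "fs: try to
clone files first in vfs_copy_file_range" commit from the shortlog: the
generic path now attempts ->clone_file_range before doing a data copy, so
btrfs only has to provide the clone hook. A hedged sketch of that ordering
(illustrative pseudologic, not the verbatim fs/read_write.c body):

    #include <linux/fs.h>
    #include <linux/splice.h>

    static ssize_t example_copy_file_range(struct file *file_in, loff_t pos_in,
                                           struct file *file_out, loff_t pos_out,
                                           size_t len)
    {
            /* reflink first: a same-superblock clone moves no data at all */
            if (file_in->f_op->clone_file_range &&
                file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
                    int err = file_in->f_op->clone_file_range(file_in, pos_in,
                                                              file_out, pos_out,
                                                              len);
                    if (err == 0)
                            return len;
                    /* on failure, fall through to an actual copy */
            }
            return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
    }
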
@@@ -3252,7 -3249,7 +3249,7 @@@ void btrfs_sysfs_remove_mounted(struct 
  ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
  
  /* super.c */
 -int btrfs_parse_options(struct btrfs_root *root, char *options,
 +int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                        unsigned long new_flags);
  int btrfs_sync_fs(struct super_block *sb, int wait);
  
@@@ -3445,14 -3442,9 +3442,14 @@@ do {                                                          
        /* Report first abort since mount */                    \
        if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
                        &((trans)->fs_info->fs_state))) {       \
 -              WARN(1, KERN_DEBUG                              \
 -              "BTRFS: Transaction aborted (error %d)\n",      \
 -              (errno));                                       \
 +              if ((errno) != -EIO) {                          \
 +                      WARN(1, KERN_DEBUG                              \
 +                      "BTRFS: Transaction aborted (error %d)\n",      \
 +                      (errno));                                       \
 +              } else {                                                \
 +                      pr_debug("BTRFS: Transaction aborted (error %d)\n", \
 +                                (errno));                     \
 +              }                                               \
        }                                                       \
        __btrfs_abort_transaction((trans), __func__,            \
                                  __LINE__, (errno));           \
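
At a call site, the -EIO special case means a transaction aborted by plain
media errors no longer triggers a full WARN splat. Sketch (hypothetical
wrapper; btrfs_del_csums() prototype as converted above):

    static int example_drop_csums(struct btrfs_trans_handle *trans,
                                  struct btrfs_fs_info *fs_info,
                                  u64 bytenr, u64 len)
    {
            int ret = btrfs_del_csums(trans, fs_info, bytenr, len);

            if (ret)        /* -EIO now logs at debug level; other errors still WARN */
                    btrfs_abort_transaction(trans, ret);
            return ret;
    }
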
@@@ -3614,7 -3606,7 +3611,7 @@@ static inline int btrfs_init_acl(struc
  #endif
  
  /* relocation.c */
 -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
 +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
  int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
  int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
@@@ -3633,12 -3625,12 +3630,12 @@@ int btrfs_reloc_post_snapshot(struct bt
  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace);
 -void btrfs_scrub_pause(struct btrfs_root *root);
 -void btrfs_scrub_continue(struct btrfs_root *root);
 +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
 +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
  int btrfs_scrub_cancel(struct btrfs_fs_info *info);
  int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                           struct btrfs_device *dev);
 -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress);
  
  /* dev-replace.c */
@@@ -3653,7 -3645,7 +3650,7 @@@ static inline void btrfs_bio_counter_de
  
  /* reada.c */
  struct reada_control {
 -      struct btrfs_root       *root;          /* tree to prefetch */
 +      struct btrfs_fs_info    *fs_info;               /* tree to prefetch */
        struct btrfs_key        key_start;
        struct btrfs_key        key_end;        /* exclusive */
        atomic_t                elems;
@@@ -3665,7 -3657,7 +3662,7 @@@ struct reada_control *btrfs_reada_add(s
  int btrfs_reada_wait(void *handle);
  void btrfs_reada_detach(void *handle);
  int btree_readahead_hook(struct btrfs_fs_info *fs_info,
 -                       struct extent_buffer *eb, u64 start, int err);
 +                       struct extent_buffer *eb, int err);
  
  static inline int is_fstree(u64 rootid)
  {
diff --combined fs/btrfs/file.c
@@@ -27,6 -27,7 +27,6 @@@
  #include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
 -#include <linux/statfs.h>
  #include <linux/compat.h>
  #include <linux/slab.h>
  #include <linux/btrfs.h>
@@@ -95,13 -96,13 +95,13 @@@ static int __compare_inode_defrag(struc
  static int __btrfs_add_inode_defrag(struct inode *inode,
                                    struct inode_defrag *defrag)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct inode_defrag *entry;
        struct rb_node **p;
        struct rb_node *parent = NULL;
        int ret;
  
 -      p = &root->fs_info->defrag_inodes.rb_node;
 +      p = &fs_info->defrag_inodes.rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
        }
        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
 -      rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
 +      rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
        return 0;
  }
  
 -static inline int __need_auto_defrag(struct btrfs_root *root)
 +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
  {
 -      if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
 +      if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
                return 0;
  
 -      if (btrfs_fs_closing(root->fs_info))
 +      if (btrfs_fs_closing(fs_info))
                return 0;
  
        return 1;
  int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct inode *inode)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct inode_defrag *defrag;
        u64 transid;
        int ret;
  
 -      if (!__need_auto_defrag(root))
 +      if (!__need_auto_defrag(fs_info))
                return 0;
  
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
        defrag->transid = transid;
        defrag->root = root->root_key.objectid;
  
 -      spin_lock(&root->fs_info->defrag_inodes_lock);
 +      spin_lock(&fs_info->defrag_inodes_lock);
        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
                /*
                 * If we set IN_DEFRAG flag and evict the inode from memory,
        } else {
                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
        }
 -      spin_unlock(&root->fs_info->defrag_inodes_lock);
 +      spin_unlock(&fs_info->defrag_inodes_lock);
        return 0;
  }
  
  static void btrfs_requeue_inode_defrag(struct inode *inode,
                                       struct inode_defrag *defrag)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int ret;
  
 -      if (!__need_auto_defrag(root))
 +      if (!__need_auto_defrag(fs_info))
                goto out;
  
        /*
         * Here we don't check the IN_DEFRAG flag, because we need merge
         * them together.
         */
 -      spin_lock(&root->fs_info->defrag_inodes_lock);
 +      spin_lock(&fs_info->defrag_inodes_lock);
        ret = __btrfs_add_inode_defrag(inode, defrag);
 -      spin_unlock(&root->fs_info->defrag_inodes_lock);
 +      spin_unlock(&fs_info->defrag_inodes_lock);
        if (ret)
                goto out;
        return;
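
All three defrag paths above now fetch the fs_info from the superblock once;
the gating test they share is a one-liner (sketch; mirrors the converted
__need_auto_defrag() above):

    static int example_should_defrag(struct inode *inode)
    {
            struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

            return btrfs_test_opt(fs_info, AUTO_DEFRAG) &&
                   !btrfs_fs_closing(fs_info);
    }
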
@@@ -373,7 -373,7 +373,7 @@@ int btrfs_run_defrag_inodes(struct btrf
                             &fs_info->fs_state))
                        break;
  
 -              if (!__need_auto_defrag(fs_info->tree_root))
 +              if (!__need_auto_defrag(fs_info))
                        break;
  
                /* find an inode to defrag */
@@@ -485,11 -485,11 +485,11 @@@ static void btrfs_drop_pages(struct pag
   * this also makes the decision about creating an inline extent vs
   * doing real data extents, marking pages dirty and delalloc as required.
   */
 -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 -                           struct page **pages, size_t num_pages,
 -                           loff_t pos, size_t write_bytes,
 -                           struct extent_state **cached)
 +int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 +                    size_t num_pages, loff_t pos, size_t write_bytes,
 +                    struct extent_state **cached)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int err = 0;
        int i;
        u64 num_bytes;
        u64 end_pos = pos + write_bytes;
        loff_t isize = i_size_read(inode);
  
 -      start_pos = pos & ~((u64)root->sectorsize - 1);
 -      num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
 +      start_pos = pos & ~((u64) fs_info->sectorsize - 1);
 +      num_bytes = round_up(write_bytes + pos - start_pos,
 +                           fs_info->sectorsize);
  
        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@@ -697,7 -696,6 +697,7 @@@ int __btrfs_drop_extents(struct btrfs_t
                         u32 extent_item_size,
                         int *key_inserted)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 num_bytes = 0;
        u64 extent_offset = 0;
        u64 extent_end = 0;
 +      u64 last_end = start;
        int del_nr = 0;
        int del_slot = 0;
        int extent_type;
                modify_tree = 0;
  
        update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 -                     root == root->fs_info->tree_root);
 +                     root == fs_info->tree_root);
        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@@ -800,10 -797,8 +800,10 @@@ next_slot
                 * extent item in the call to setup_items_for_insert() later
                 * in this function.
                 */
 -              if (extent_end == key.offset && extent_end >= search_start)
 +              if (extent_end == key.offset && extent_end >= search_start) {
 +                      last_end = extent_end;
                        goto delete_extent_item;
 +              }
  
                if (extent_end <= search_start) {
                        path->slots[0]++;
                        btrfs_mark_buffer_dirty(leaf);
  
                        if (update_refs && disk_bytenr > 0) {
 -                              ret = btrfs_inc_extent_ref(trans, root,
 +                              ret = btrfs_inc_extent_ref(trans, fs_info,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
                        }
                        key.offset = start;
                }
 +              /*
 +               * From here on out we will have actually dropped something, so
 +               * last_end can be updated.
 +               */
 +              last_end = extent_end;
 +
                /*
                 *  | ---- range to drop ----- |
                 *      | -------- extent -------- |
  
                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = end;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
  
                        extent_offset += end - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@@ -938,9 -927,9 +938,9 @@@ delete_extent_item
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                extent_end = ALIGN(extent_end,
 -                                                 root->sectorsize);
 +                                                 fs_info->sectorsize);
                        } else if (update_refs && disk_bytenr > 0) {
 -                              ret = btrfs_free_extent(trans, root,
 +                              ret = btrfs_free_extent(trans, fs_info,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
        if (!ret && replace_extent && leafs_visited == 1 &&
            (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
             path->locks[0] == BTRFS_WRITE_LOCK) &&
 -          btrfs_leaf_free_space(root, leaf) >=
 +          btrfs_leaf_free_space(fs_info, leaf) >=
            sizeof(struct btrfs_item) + extent_item_size) {
  
                key.objectid = ino;
        if (!replace_extent || !(*key_inserted))
                btrfs_release_path(path);
        if (drop_end)
 -              *drop_end = found ? min(end, extent_end) : end;
 +              *drop_end = found ? min(end, last_end) : end;
        return ret;
  }
  
@@@ -1084,7 -1073,6 +1084,7 @@@ static int extent_mergeable(struct exte
  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
@@@ -1154,7 -1142,7 +1154,7 @@@ again
                                     ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_generation(leaf, fi,
                                                         trans->transid);
                        path->slots[0]++;
                        new_key.offset = start;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
  
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                                                extent_end - split);
                btrfs_mark_buffer_dirty(leaf);
  
 -              ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 -                                         root->root_key.objectid,
 +              ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes,
 +                                         0, root->root_key.objectid,
                                           ino, orig_offset);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                extent_end = other_end;
                del_slot = path->slots[0] + 1;
                del_nr++;
 -              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 +              ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        ino, orig_offset);
                if (ret) {
                key.offset = other_start;
                del_slot = path->slots[0];
                del_nr++;
 -              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 +              ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        ino, orig_offset);
                if (ret) {
@@@ -1421,16 -1409,15 +1421,16 @@@ lock_and_cleanup_extent_if_need(struct 
                                u64 *lockstart, u64 *lockend,
                                struct extent_state **cached_state)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 start_pos;
        u64 last_pos;
        int i;
        int ret = 0;
  
 -      start_pos = round_down(pos, root->sectorsize);
 +      start_pos = round_down(pos, fs_info->sectorsize);
        last_pos = start_pos
 -              + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
 +              + round_up(pos + write_bytes - start_pos,
 +                         fs_info->sectorsize) - 1;
  
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
  static noinline int check_can_nocow(struct inode *inode, loff_t pos,
                                    size_t *write_bytes)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered;
        u64 lockstart, lockend;
        if (!ret)
                return -ENOSPC;
  
 -      lockstart = round_down(pos, root->sectorsize);
 -      lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
 +      lockstart = round_down(pos, fs_info->sectorsize);
 +      lockend = round_up(pos + *write_bytes,
 +                         fs_info->sectorsize) - 1;
  
        while (1) {
                lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@@ -1524,7 -1509,6 +1524,7 @@@ static noinline ssize_t __btrfs_buffere
                                               loff_t pos)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
        struct extent_state *cached_state = NULL;
                        break;
                }
  
 -              sector_offset = pos & (root->sectorsize - 1);
 +              sector_offset = pos & (fs_info->sectorsize - 1);
                reserve_bytes = round_up(write_bytes + sector_offset,
 -                              root->sectorsize);
 +                              fs_info->sectorsize);
  
                ret = btrfs_check_data_free_space(inode, pos, write_bytes);
                if (ret < 0) {
                                                         PAGE_SIZE);
                                reserve_bytes = round_up(write_bytes +
                                                         sector_offset,
 -                                                       root->sectorsize);
 +                                                       fs_info->sectorsize);
                        } else {
                                break;
                        }
@@@ -1637,10 -1621,12 +1637,10 @@@ again
  
                copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
  
 -              num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
 -                                              reserve_bytes);
 +              num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
                dirty_sectors = round_up(copied + sector_offset,
 -                                      root->sectorsize);
 -              dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
 -                                              dirty_sectors);
 +                                      fs_info->sectorsize);
 +              dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
  
                /*
                 * if we have trouble faulting in the pages, fall
                 * managed to copy.
                 */
                if (num_sectors > dirty_sectors) {
 -
                        /* release everything except the sectors we dirtied */
                        release_bytes -= dirty_sectors <<
 -                              root->fs_info->sb->s_blocksize_bits;
 -
 +                                              fs_info->sb->s_blocksize_bits;
                        if (copied > 0) {
                                spin_lock(&BTRFS_I(inode)->lock);
                                BTRFS_I(inode)->outstanding_extents++;
                        } else {
                                u64 __pos;
  
 -                              __pos = round_down(pos, root->sectorsize) +
 +                              __pos = round_down(pos,
 +                                                 fs_info->sectorsize) +
                                        (dirty_pages << PAGE_SHIFT);
                                btrfs_delalloc_release_space(inode, __pos,
                                                             release_bytes);
                }
  
                release_bytes = round_up(copied + sector_offset,
 -                                      root->sectorsize);
 +                                      fs_info->sectorsize);
  
                if (copied > 0)
 -                      ret = btrfs_dirty_pages(root, inode, pages,
 -                                              dirty_pages, pos, copied,
 -                                              NULL);
 +                      ret = btrfs_dirty_pages(inode, pages, dirty_pages,
 +                                              pos, copied, NULL);
                if (need_unlock)
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             lockstart, lockend, &cached_state,
                        btrfs_end_write_no_snapshoting(root);
  
                if (only_release_metadata && copied > 0) {
 -                      lockstart = round_down(pos, root->sectorsize);
 -                      lockend = round_up(pos + copied, root->sectorsize) - 1;
 +                      lockstart = round_down(pos,
 +                                             fs_info->sectorsize);
 +                      lockend = round_up(pos + copied,
 +                                         fs_info->sectorsize) - 1;
  
                        set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
                                       lockend, EXTENT_NORESERVE, NULL,
                cond_resched();
  
                balance_dirty_pages_ratelimited(inode->i_mapping);
 -              if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
 -                      btrfs_btree_balance_dirty(root);
 +              if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
 +                      btrfs_btree_balance_dirty(fs_info);
  
                pos += copied;
                num_written += copied;
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
                        btrfs_delalloc_release_space(inode,
 -                                              round_down(pos, root->sectorsize),
 +                                              round_down(pos, fs_info->sectorsize),
                                                release_bytes);
                }
        }
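
The sector bookkeeping threaded through __btrfs_buffered_write() reduces to a
couple of round/shift operations on the cached sectorsize; isolated here for
clarity (sketch; assumes ctree.h):

    static u64 example_dirty_sectors(struct btrfs_fs_info *fs_info,
                                     loff_t pos, size_t copied)
    {
            u64 sector_offset = pos & (fs_info->sectorsize - 1);

            /* bytes actually touched, rounded out to whole sectors */
            return BTRFS_BYTES_TO_BLKS(fs_info,
                            round_up(copied + sector_offset,
                                     fs_info->sectorsize));
    }
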
@@@ -1812,7 -1798,6 +1812,7 @@@ static ssize_t btrfs_file_write_iter(st
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start_pos;
        u64 end_pos;
         * although we have opened a file as writable, we have
         * to stop this write operation to ensure FS consistency.
         */
 -      if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
 +      if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                inode_unlock(inode);
                err = -EROFS;
                goto out;
  
        pos = iocb->ki_pos;
        count = iov_iter_count(from);
 -      start_pos = round_down(pos, root->sectorsize);
 +      start_pos = round_down(pos, fs_info->sectorsize);
        oldsize = i_size_read(inode);
        if (start_pos > oldsize) {
                /* Expand hole size to cover write data, preventing empty gap */
 -              end_pos = round_up(pos + count, root->sectorsize);
 +              end_pos = round_up(pos + count,
 +                                 fs_info->sectorsize);
                err = btrfs_cont_expand(inode, oldsize, end_pos);
                if (err) {
                        inode_unlock(inode);
                        goto out;
                }
 -              if (start_pos > round_up(oldsize, root->sectorsize))
 +              if (start_pos > round_up(oldsize, fs_info->sectorsize))
                        clean_page = 1;
        }
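
The extending-write path just above rounds the write window outward to sector
boundaries before expanding the hole with btrfs_cont_expand(). The arithmetic
as a standalone sketch; the rounding macros are redefined here (matching the
usual power-of-two kernel definitions) so the example runs in user space:

    #include <stdio.h>

    #define round_down(x, y) ((x) & ~((__typeof__(x))(y) - 1))
    #define round_up(x, y)   ((((x) - 1) | ((__typeof__(x))(y) - 1)) + 1)

    int main(void)
    {
            unsigned long long sectorsize = 4096, pos = 6000, count = 3000;

            /* a write of 3000 bytes at offset 6000 on a 4K-sector fs */
            printf("start_pos = %llu\n", round_down(pos, sectorsize));       /* 4096  */
            printf("end_pos   = %llu\n", round_up(pos + count, sectorsize)); /* 12288 */
            return 0;
    }
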
  
@@@ -1951,7 -1935,6 +1951,7 @@@ int btrfs_sync_file(struct file *file, 
  {
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = d_inode(dentry);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct btrfs_log_ctx ctx;
         * commit does not start nor wait for ordered extents to complete.
         */
        smp_mb();
 -      if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
 +      if (btrfs_inode_in_log(inode, fs_info->generation) ||
            (full_sync && BTRFS_I(inode)->last_trans <=
 -           root->fs_info->last_trans_committed) ||
 +           fs_info->last_trans_committed) ||
            (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
             BTRFS_I(inode)->last_trans
 -           <= root->fs_info->last_trans_committed)) {
 +           <= fs_info->last_trans_committed)) {
                /*
                 * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
         * which are indicated by ctx.io_err.
         */
        if (ctx.io_err) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                ret = ctx.io_err;
                goto out;
        }
                if (!ret) {
                        ret = btrfs_sync_log(trans, root, &ctx);
                        if (!ret) {
 -                              ret = btrfs_end_transaction(trans, root);
 +                              ret = btrfs_end_transaction(trans);
                                goto out;
                        }
                }
                if (!full_sync) {
                        ret = btrfs_wait_ordered_range(inode, start, len);
                        if (ret) {
 -                              btrfs_end_transaction(trans, root);
 +                              btrfs_end_transaction(trans);
                                goto out;
                        }
                }
 -              ret = btrfs_commit_transaction(trans, root);
 +              ret = btrfs_commit_transaction(trans);
        } else {
 -              ret = btrfs_end_transaction(trans, root);
 +              ret = btrfs_end_transaction(trans);
        }
  out:
        return ret > 0 ? -EIO : ret;
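
Every btrfs_end_transaction() and btrfs_commit_transaction() call in this hunk
loses its btrfs_root argument. The enabling change is only visible indirectly
here: the transaction handle itself knows which filesystem it belongs to, so
the extra parameter carried no information. A schematic stub, not the kernel
definition:

    struct btrfs_fs_info;

    struct btrfs_trans_handle {
            struct btrfs_fs_info *fs_info;  /* recorded when the handle is started */
            /* ... */
    };

    /* was: int btrfs_end_transaction(struct btrfs_trans_handle *trans,
     *                                struct btrfs_root *root);            */
    int btrfs_end_transaction(struct btrfs_trans_handle *trans);
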
@@@ -2225,7 -2208,6 +2225,7 @@@ static int hole_mergeable(struct inode 
  static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
                      struct btrfs_path *path, u64 offset, u64 end)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        int ret;
  
 -      if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
 +      if (btrfs_fs_incompat(fs_info, NO_HOLES))
                goto out;
  
        key.objectid = btrfs_ino(inode);
        key.offset = offset;
  
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 -      if (ret < 0)
 +      if (ret <= 0) {
 +              /*
 +               * We should have dropped this offset, so if we find it then
 +               * something has gone horribly wrong.
 +               */
 +              if (ret == 0)
 +                      ret = -EINVAL;
                return ret;
 -      BUG_ON(!ret);
 +      }
  
        leaf = path->nodes[0];
        if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
                u64 num_bytes;
  
                key.offset = offset;
 -              btrfs_set_item_key_safe(root->fs_info, path, &key);
 +              btrfs_set_item_key_safe(fs_info, path, &key);
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
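
fill_holes() above replaces BUG_ON(!ret) with an error return. The reasoning
is in the new comment: an exact key match means the offset that should have
been dropped still exists, so the tree is already inconsistent, and returning
-EINVAL beats crashing the box. The shape of the hardening as a runnable
miniature; search() is a stand-in for btrfs_search_slot():

    #include <errno.h>
    #include <stdio.h>

    static int search(void) { return 1; }  /* >0: no exact match, the expected case */

    static int fill_holes_like(void)
    {
            int ret = search();
            if (ret <= 0) {
                    if (ret == 0)           /* "impossible" exact match */
                            ret = -EINVAL;  /* fail the operation, don't BUG */
                    return ret;
            }
            /* ... insert or merge the hole item ... */
            return 0;
    }

    int main(void)
    {
            printf("ret=%d\n", fill_holes_like());  /* ret=0 */
            return 0;
    }
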
@@@ -2308,7 -2284,7 +2308,7 @@@ out
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
                hole_em->orig_block_len = 0;
 -              hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 +              hole_em->bdev = fs_info->fs_devices->latest_bdev;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
  
@@@ -2360,7 -2336,6 +2360,7 @@@ static int find_first_non_hole(struct i
  
  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        u64 tail_len;
        u64 orig_start = offset;
        u64 cur_offset;
 -      u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 +      u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
        u64 drop_end;
        int ret = 0;
        int err = 0;
        unsigned int rsv_count;
        bool same_block;
 -      bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 +      bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
        u64 ino_size;
        bool truncated_block = false;
        bool updated_inode = false;
                return ret;
  
        inode_lock(inode);
 -      ino_size = round_up(inode->i_size, root->sectorsize);
 +      ino_size = round_up(inode->i_size, fs_info->sectorsize);
        ret = find_first_non_hole(inode, &offset, &len);
        if (ret < 0)
                goto out_only_mutex;
                goto out_only_mutex;
        }
  
 -      lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 +      lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
        lockend = round_down(offset + len,
 -                           BTRFS_I(inode)->root->sectorsize) - 1;
 -      same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
 -              == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
 +                           btrfs_inode_sectorsize(inode)) - 1;
 +      same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
 +              == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
        /*
         * We needn't truncate any block which is beyond the end of the file
         * because we are sure there is no data there.
         * Only do this if we are in the same block and we aren't doing the
         * entire block.
         */
 -      if (same_block && len < root->sectorsize) {
 +      if (same_block && len < fs_info->sectorsize) {
                if (offset < ino_size) {
                        truncated_block = true;
                        ret = btrfs_truncate_block(inode, offset, len, 0);
                goto out;
        }
  
 -      rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 +      rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                ret = -ENOMEM;
                goto out_free;
        }
 -      rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
 +      rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
        rsv->failfast = 1;
  
        /*
                goto out_free;
        }
  
 -      ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
 +      ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
                                      min_size, 0);
        BUG_ON(ret);
        trans->block_rsv = rsv;
                if (ret != -ENOSPC)
                        break;
  
 -              trans->block_rsv = &root->fs_info->trans_block_rsv;
 +              trans->block_rsv = &fs_info->trans_block_rsv;
  
 -              if (cur_offset < ino_size) {
 +              if (cur_offset < drop_end && cur_offset < ino_size) {
                        ret = fill_holes(trans, inode, path, cur_offset,
                                         drop_end);
                        if (ret) {
 +                              /*
 +                               * If we failed then we didn't insert our hole
 +                               * entries for the area we dropped, so now the
 +                               * fs is corrupted, so we must abort the
 +                               * transaction.
 +                               */
 +                              btrfs_abort_transaction(trans, ret);
                                err = ret;
                                break;
                        }
                        break;
                }
  
 -              btrfs_end_transaction(trans, root);
 -              btrfs_btree_balance_dirty(root);
 +              btrfs_end_transaction(trans);
 +              btrfs_btree_balance_dirty(fs_info);
  
                trans = btrfs_start_transaction(root, rsv_count);
                if (IS_ERR(trans)) {
                        break;
                }
  
 -              ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
 +              ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
                                              rsv, min_size, 0);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
                goto out_trans;
        }
  
 -      trans->block_rsv = &root->fs_info->trans_block_rsv;
 +      trans->block_rsv = &fs_info->trans_block_rsv;
        /*
         * If we are using the NO_HOLES feature we might already have had a
         * hole that overlaps a part of the region [lockstart, lockend] and
        if (cur_offset < ino_size && cur_offset < drop_end) {
                ret = fill_holes(trans, inode, path, cur_offset, drop_end);
                if (ret) {
 +                      /* Same comment as above. */
 +                      btrfs_abort_transaction(trans, ret);
                        err = ret;
                        goto out_trans;
                }
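
Both fill_holes() failure paths in btrfs_punch_hole() now abort the
transaction first, for the reason the new comment spells out: the extents are
already dropped, so a missing hole item leaves the tree inconsistent, and
btrfs_abort_transaction() (which forces the filesystem read-only) is the only
safe reaction before unwinding. The pattern in isolation, kernel-style, with
everything except the two btrfs_* calls elided:

    ret = fill_holes(trans, inode, path, cur_offset, drop_end);
    if (ret) {
            /* tree modified but hole item missing: poison the
             * transaction before taking the normal error exit */
            btrfs_abort_transaction(trans, ret);
            err = ret;
            break;          /* or goto out_trans, as in the tail copy */
    }
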
@@@ -2639,14 -2605,14 +2639,14 @@@ out_trans
        inode_inc_iversion(inode);
        inode->i_mtime = inode->i_ctime = current_time(inode);
  
 -      trans->block_rsv = &root->fs_info->trans_block_rsv;
 +      trans->block_rsv = &fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
        updated_inode = true;
 -      btrfs_end_transaction(trans, root);
 -      btrfs_btree_balance_dirty(root);
 +      btrfs_end_transaction(trans);
 +      btrfs_btree_balance_dirty(fs_info);
  out_free:
        btrfs_free_path(path);
 -      btrfs_free_block_rsv(root, rsv);
 +      btrfs_free_block_rsv(fs_info, rsv);
  out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
@@@ -2664,7 -2630,7 +2664,7 @@@ out_only_mutex
                        err = PTR_ERR(trans);
                } else {
                        err = btrfs_update_inode(trans, root, inode);
 -                      ret = btrfs_end_transaction(trans, root);
 +                      ret = btrfs_end_transaction(trans);
                }
        }
        inode_unlock(inode);
@@@ -2729,7 -2695,7 +2729,7 @@@ static long btrfs_fallocate(struct fil
        u64 locked_end;
        u64 actual_end = 0;
        struct extent_map *em;
 -      int blocksize = BTRFS_I(inode)->root->sectorsize;
 +      int blocksize = btrfs_inode_sectorsize(inode);
        int ret;
  
        alloc_start = round_down(offset, blocksize);
                        btrfs_ordered_update_i_size(inode, actual_end, NULL);
                        ret = btrfs_update_inode(trans, root, inode);
                        if (ret)
 -                              btrfs_end_transaction(trans, root);
 +                              btrfs_end_transaction(trans);
                        else
 -                              ret = btrfs_end_transaction(trans, root);
 +                              ret = btrfs_end_transaction(trans);
                }
        }
  out_unlock:
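
The fallocate tail above is a small error-precedence idiom: if an earlier step
already failed, btrfs_end_transaction()'s own return value is deliberately
discarded so the first error is reported; only on the success path can the
commit's result become the return code. A runnable miniature with invented
errno values:

    #include <stdio.h>

    static int end_trans(void) { return -5; }  /* pretend the commit fails: -EIO */

    int main(void)
    {
            int ret = -28;                  /* earlier failure: -ENOSPC */

            if (ret)
                    (void)end_trans();      /* don't clobber the first error */
            else
                    ret = end_trans();      /* success path: surface commit errors */

            printf("ret=%d\n", ret);        /* ret=-28 */
            return 0;
    }
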
@@@ -2925,7 -2891,7 +2925,7 @@@ out
  
  static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 lockstart;
         */
        start = max_t(loff_t, 0, *offset);
  
 -      lockstart = round_down(start, root->sectorsize);
 -      lockend = round_up(i_size_read(inode), root->sectorsize);
 +      lockstart = round_down(start, fs_info->sectorsize);
 +      lockend = round_up(i_size_read(inode),
 +                         fs_info->sectorsize);
        if (lockend <= lockstart)
 -              lockend = lockstart + root->sectorsize;
 +              lockend = lockstart + fs_info->sectorsize;
        lockend--;
        len = lockend - lockstart + 1;
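
find_desired_extent() clamps its lock range so it can never be empty. Worked
numbers, assuming sectorsize 4096, start = 8192, i_size = 100:

    lockstart = round_down(8192, 4096) = 8192
    lockend   = round_up(100, 4096)    = 4096    (<= lockstart, so clamp)
    lockend   = 8192 + 4096            = 12288
    lockend-- = 12287;  len = 12287 - 8192 + 1 = 4096

That is one full sector starting at lockstart, even when the rounded i_size
falls at or below the search start.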
  
@@@ -3033,7 -2998,6 +3033,6 @@@ const struct file_operations btrfs_file
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
  #endif
-       .copy_file_range = btrfs_copy_file_range,
        .clone_file_range = btrfs_clone_file_range,
        .dedupe_file_range = btrfs_dedupe_file_range,
  };
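
Note the removal of .copy_file_range from btrfs_file_operations: the
btrfs_copy_file_range() wrapper it pointed at, deleted from ioctl.c further
down in this diff, did nothing but call btrfs_clone_files() and report the
whole requested length as copied on success. Its semantics in miniature,
where clone() is a stand-in for that call:

    ssize_t ret = clone(file_out, file_in, pos_in, pos_out, len);
    return ret == 0 ? (ssize_t)len : ret;

.clone_file_range and .dedupe_file_range stay, so clone-capable copies remain
reachable through the remaining hooks.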
diff --combined fs/btrfs/ioctl.c
@@@ -33,6 -33,7 +33,6 @@@
  #include <linux/namei.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
 -#include <linux/statfs.h>
  #include <linux/compat.h>
  #include <linux/bit_spinlock.h>
  #include <linux/security.h>
@@@ -215,7 -216,6 +215,7 @@@ static int check_flags(unsigned int fla
  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_inode *ip = BTRFS_I(inode);
        struct btrfs_root *root = ip->root;
        struct btrfs_trans_handle *trans;
                ip->flags |= BTRFS_INODE_COMPRESS;
                ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
  
 -              if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
 +              if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
                        comp = "lzo";
                else
                        comp = "zlib";
        inode->i_ctime = current_time(inode);
        ret = btrfs_update_inode(trans, root, inode);
  
 -      btrfs_end_transaction(trans, root);
 +      btrfs_end_transaction(trans);
   out_drop:
        if (ret) {
                ip->flags = ip_oldflags;
@@@ -374,8 -374,7 +374,8 @@@ static int btrfs_ioctl_getversion(struc
  
  static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
  {
 -      struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
  
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
 -      ret = btrfs_trim_fs(fs_info->tree_root, &range);
 +      ret = btrfs_trim_fs(fs_info, &range);
        if (ret < 0)
                return ret;
  
@@@ -438,7 -437,6 +438,7 @@@ static noinline int create_subvol(struc
                                  u64 *async_transid,
                                  struct btrfs_qgroup_inherit *inherit)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_root_item *root_item;
        if (!root_item)
                return -ENOMEM;
  
 -      ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
 +      ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
                goto fail_free;
  
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 -              btrfs_subvolume_release_metadata(root, &block_rsv,
 +              btrfs_subvolume_release_metadata(fs_info, &block_rsv,
                                                 qgroup_reserved);
                goto fail_free;
        }
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
  
 -      ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
 +      ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit);
        if (ret)
                goto fail;
  
                goto fail;
        }
  
 -      memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
 +      memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(leaf, leaf->start);
        btrfs_set_header_generation(leaf, trans->transid);
        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(leaf, objectid);
  
 -      write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
 -                          BTRFS_FSID_SIZE);
 -      write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
 -                          btrfs_header_chunk_tree_uuid(leaf),
 -                          BTRFS_UUID_SIZE);
 +      write_extent_buffer_fsid(leaf, fs_info->fsid);
 +      write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
        btrfs_mark_buffer_dirty(leaf);
  
        inode_item = &root_item->inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
 -      btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
 +      btrfs_set_stack_inode_nbytes(inode_item,
 +                                   fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
  
        btrfs_set_root_flags(root_item, 0);
        key.objectid = objectid;
        key.offset = 0;
        key.type = BTRFS_ROOT_ITEM_KEY;
 -      ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 +      ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
                                root_item);
        if (ret)
                goto fail;
  
        key.offset = (u64)-1;
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
  
 -      ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
 +      ret = btrfs_add_root_ref(trans, fs_info,
                                 objectid, root->root_key.objectid,
                                 btrfs_ino(dir), index, name, namelen);
        BUG_ON(ret);
  
 -      ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
 -                                root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
 -                                objectid);
 +      ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
 +                                BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret)
                btrfs_abort_transaction(trans, ret);
  
@@@ -613,15 -614,15 +613,15 @@@ fail
        kfree(root_item);
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
 -      btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 +      btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
  
        if (async_transid) {
                *async_transid = trans->transid;
 -              err = btrfs_commit_transaction_async(trans, root, 1);
 +              err = btrfs_commit_transaction_async(trans, 1);
                if (err)
 -                      err = btrfs_commit_transaction(trans, root);
 +                      err = btrfs_commit_transaction(trans);
        } else {
 -              err = btrfs_commit_transaction(trans, root);
 +              err = btrfs_commit_transaction(trans);
        }
        if (err && !ret)
                ret = err;
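
create_subvol()'s commit tail keeps the async-commit fallback idiom across the
signature change: btrfs_commit_transaction_async() can fail to start the
background commit, in which case the caller just commits synchronously on the
spot. A runnable miniature; the stub return values are invented:

    #include <stdio.h>

    static int commit_async(void) { return -12; }  /* say -ENOMEM: could not queue */
    static int commit_sync(void)  { return 0; }

    int main(void)
    {
            int err = commit_async();
            if (err)                        /* async path only failed to start, */
                    err = commit_sync();    /* so fall back to an inline commit */
            printf("err=%d\n", err);        /* err=0 */
            return 0;
    }
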
@@@ -661,7 -662,6 +661,7 @@@ static int create_snapshot(struct btrfs
                           u64 *async_transid, bool readonly,
                           struct btrfs_qgroup_inherit *inherit)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        struct btrfs_trans_handle *trans;
                goto fail;
        }
  
 -      spin_lock(&root->fs_info->trans_lock);
 +      spin_lock(&fs_info->trans_lock);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
 -      spin_unlock(&root->fs_info->trans_lock);
 +      spin_unlock(&fs_info->trans_lock);
        if (async_transid) {
                *async_transid = trans->transid;
 -              ret = btrfs_commit_transaction_async(trans,
 -                                   root->fs_info->extent_root, 1);
 +              ret = btrfs_commit_transaction_async(trans, 1);
                if (ret)
 -                      ret = btrfs_commit_transaction(trans, root);
 +                      ret = btrfs_commit_transaction(trans);
        } else {
 -              ret = btrfs_commit_transaction(trans,
 -                                             root->fs_info->extent_root);
 +              ret = btrfs_commit_transaction(trans);
        }
        if (ret)
                goto fail;
        d_instantiate(dentry, inode);
        ret = 0;
  fail:
 -      btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
 +      btrfs_subvolume_release_metadata(fs_info,
                                         &pending_snapshot->block_rsv,
                                         pending_snapshot->qgroup_reserved);
  dec_and_free:
@@@ -834,14 -836,13 +834,14 @@@ static inline int btrfs_may_create(stru
   * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
   * inside this filesystem so it's quite a bit simpler.
   */
- static noinline int btrfs_mksubvol(struct path *parent,
+ static noinline int btrfs_mksubvol(const struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
                                   u64 *async_transid, bool readonly,
                                   struct btrfs_qgroup_inherit *inherit)
  {
 -      struct inode *dir  = d_inode(parent->dentry);
 +      struct inode *dir = d_inode(parent->dentry);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct dentry *dentry;
        int error;
  
        if (error)
                goto out_dput;
  
 -      down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 +      down_read(&fs_info->subvol_sem);
  
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
                goto out_up_read;
        if (!error)
                fsnotify_mkdir(dir, dentry);
  out_up_read:
 -      up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 +      up_read(&fs_info->subvol_sem);
  out_dput:
        dput(dentry);
  out_unlock:
@@@ -1267,7 -1268,6 +1267,7 @@@ int btrfs_defrag_file(struct inode *ino
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_to_defrag)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct file_ra_state *ra = NULL;
        unsigned long last_index;
                if (!(inode->i_sb->s_flags & MS_ACTIVE))
                        break;
  
 -              if (btrfs_defrag_cancelled(root->fs_info)) {
 -                      btrfs_debug(root->fs_info, "defrag_file cancelled");
 +              if (btrfs_defrag_cancelled(fs_info)) {
 +                      btrfs_debug(fs_info, "defrag_file cancelled");
                        ret = -EAGAIN;
                        break;
                }
                 * we have to make sure the IO is actually started and that
                 * ordered extents get created before we return
                 */
 -              atomic_inc(&root->fs_info->async_submit_draining);
 -              while (atomic_read(&root->fs_info->nr_async_submits) ||
 -                    atomic_read(&root->fs_info->async_delalloc_pages)) {
 -                      wait_event(root->fs_info->async_submit_wait,
 -                         (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
 -                          atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 +              atomic_inc(&fs_info->async_submit_draining);
 +              while (atomic_read(&fs_info->nr_async_submits) ||
 +                     atomic_read(&fs_info->async_delalloc_pages)) {
 +                      wait_event(fs_info->async_submit_wait,
 +                                 (atomic_read(&fs_info->nr_async_submits) == 0 &&
 +                                  atomic_read(&fs_info->async_delalloc_pages) == 0));
                }
 -              atomic_dec(&root->fs_info->async_submit_draining);
 +              atomic_dec(&fs_info->async_submit_draining);
        }
  
        if (range->compress_type == BTRFS_COMPRESS_LZO) {
 -              btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
 +              btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
        }
  
        ret = defrag_count;
@@@ -1485,12 -1485,10 +1485,12 @@@ out_ra
  static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
  {
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
        if (ret)
                return ret;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                mnt_drop_write_file(file);
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
        }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                        ret = -EINVAL;
                        goto out_free;
                }
 -              btrfs_info(root->fs_info, "resizing devid %llu", devid);
 +              btrfs_info(fs_info, "resizing devid %llu", devid);
        }
  
 -      device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 +      device = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (!device) {
 -              btrfs_info(root->fs_info, "resizer unable to find device %llu",
 -                     devid);
 +              btrfs_info(fs_info, "resizer unable to find device %llu",
 +                         devid);
                ret = -ENODEV;
                goto out_free;
        }
  
        if (!device->writeable) {
 -              btrfs_info(root->fs_info,
 +              btrfs_info(fs_info,
                           "resizer unable to apply on readonly device %llu",
                       devid);
                ret = -EPERM;
                goto out_free;
        }
  
 -      new_size = div_u64(new_size, root->sectorsize);
 -      new_size *= root->sectorsize;
 +      new_size = div_u64(new_size, fs_info->sectorsize);
 +      new_size *= fs_info->sectorsize;
  
 -      btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
 -                    rcu_str_deref(device->name), new_size);
 +      btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
 +                        rcu_str_deref(device->name), new_size);
  
        if (new_size > old_size) {
                trans = btrfs_start_transaction(root, 0);
                        goto out_free;
                }
                ret = btrfs_grow_device(trans, device, new_size);
 -              btrfs_commit_transaction(trans, root);
 +              btrfs_commit_transaction(trans);
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
        } /* equal, nothing to do */
  out_free:
        kfree(vol_args);
  out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        mnt_drop_write_file(file);
        return ret;
  }
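
btrfs_ioctl_resize(), like the add/remove-device and dev-replace ioctls below,
serializes through fs_info->mutually_exclusive_operation_running with the
classic xchg test-and-set: the returned old value says whether another
exclusive operation was already running. A runnable C11 model of the guard,
with -1 standing in for BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int op_running;   /* models mutually_exclusive_operation_running */

    static int try_start_exclusive_op(void)
    {
            if (atomic_exchange(&op_running, 1))    /* old value != 0: busy */
                    return -1;
            return 0;
    }

    static void end_exclusive_op(void)
    {
            atomic_store(&op_running, 0);
    }

    int main(void)
    {
            printf("first:  %d\n", try_start_exclusive_op());   /* 0  */
            printf("second: %d\n", try_start_exclusive_op());   /* -1 */
            end_exclusive_op();
            return 0;
    }
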
@@@ -1775,7 -1774,6 +1775,7 @@@ static noinline int btrfs_ioctl_subvol_
                                                void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        u64 flags = 0;
        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
                return -EINVAL;
  
 -      down_read(&root->fs_info->subvol_sem);
 +      down_read(&fs_info->subvol_sem);
        if (btrfs_root_readonly(root))
                flags |= BTRFS_SUBVOL_RDONLY;
 -      up_read(&root->fs_info->subvol_sem);
 +      up_read(&fs_info->subvol_sem);
  
        if (copy_to_user(arg, &flags, sizeof(flags)))
                ret = -EFAULT;
@@@ -1798,7 -1796,6 +1798,7 @@@ static noinline int btrfs_ioctl_subvol_
                                              void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 root_flags;
                goto out_drop_write;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        /* nothing to do */
        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
                        spin_unlock(&root->root_item_lock);
                } else {
                        spin_unlock(&root->root_item_lock);
 -                      btrfs_warn(root->fs_info,
 -                      "Attempt to set subvolume %llu read-write during send",
 -                                      root->root_key.objectid);
 +                      btrfs_warn(fs_info,
 +                                 "Attempt to set subvolume %llu read-write during send",
 +                                 root->root_key.objectid);
                        ret = -EPERM;
                        goto out_drop_sem;
                }
                goto out_reset;
        }
  
 -      ret = btrfs_update_root(trans, root->fs_info->tree_root,
 +      ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
  
 -      btrfs_commit_transaction(trans, root);
 +      btrfs_commit_transaction(trans);
  out_reset:
        if (ret)
                btrfs_set_root_flags(&root->root_item, root_flags);
  out_drop_sem:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
  out_drop_write:
        mnt_drop_write_file(file);
  out:
   */
  static noinline int may_destroy_subvol(struct btrfs_root *root)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
                return -ENOMEM;
  
        /* Make sure this root isn't set as the default subvol */
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 -      di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
 +      di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
                                   dir_id, "default", 7, 0);
        if (di && !IS_ERR(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
                if (key.objectid == root->root_key.objectid) {
                        ret = -EPERM;
 -                      btrfs_err(root->fs_info,
 +                      btrfs_err(fs_info,
                                  "deleting default subvolume %llu is not allowed",
                                  key.objectid);
                        goto out;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = (u64)-1;
  
 -      ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
 -                              &key, path, 0, 0);
 +      ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        BUG_ON(ret == 0);
@@@ -2090,10 -2087,10 +2090,10 @@@ static noinline int search_ioctl(struc
                                 size_t *buf_size,
                                 char __user *ubuf)
  {
 +      struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root;
        struct btrfs_key key;
        struct btrfs_path *path;
 -      struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
        int ret;
        int num_found = 0;
        unsigned long sk_offset = 0;
@@@ -2356,7 -2353,6 +2356,7 @@@ static noinline int btrfs_ioctl_snap_de
                                             void __user *arg)
  {
        struct dentry *parent = file->f_path.dentry;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
        struct dentry *dentry;
        struct inode *dir = d_inode(parent);
        struct inode *inode;
                 * rmdir(2).
                 */
                err = -EPERM;
 -              if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
 +              if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
                        goto out_dput;
  
                /*
                spin_unlock(&dest->root_item_lock);
        } else {
                spin_unlock(&dest->root_item_lock);
 -              btrfs_warn(root->fs_info,
 -                      "Attempt to delete subvolume %llu during send",
 -                      dest->root_key.objectid);
 +              btrfs_warn(fs_info,
 +                         "Attempt to delete subvolume %llu during send",
 +                         dest->root_key.objectid);
                err = -EPERM;
                goto out_unlock_inode;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        err = may_destroy_subvol(dest);
        if (err)
  
        if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
                ret = btrfs_insert_orphan_item(trans,
 -                                      root->fs_info->tree_root,
 +                                      fs_info->tree_root,
                                        dest->root_key.objectid);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                }
        }
  
 -      ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 -                                dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
 +      ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
 +                                BTRFS_UUID_KEY_SUBVOL,
                                  dest->root_key.objectid);
        if (ret && ret != -ENOENT) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }
        if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
 -              ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 +              ret = btrfs_uuid_tree_rem(trans, fs_info,
                                          dest->root_item.received_uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          dest->root_key.objectid);
  out_end_trans:
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
 -      ret = btrfs_end_transaction(trans, root);
 +      ret = btrfs_end_transaction(trans);
        if (ret && !err)
                err = ret;
        inode->i_flags |= S_DEAD;
  out_release:
 -      btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 +      btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
  out_up_write:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
        if (err) {
                spin_lock(&dest->root_item_lock);
                root_flags = btrfs_root_flags(&dest->root_item);
@@@ -2659,7 -2655,7 +2659,7 @@@ out
        return ret;
  }
  
 -static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
  {
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 -      }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        }
  
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 -      ret = btrfs_init_new_device(root, vol_args->name);
 +      ret = btrfs_init_new_device(fs_info, vol_args->name);
  
        if (!ret)
 -              btrfs_info(root->fs_info, "disk added %s",vol_args->name);
 +              btrfs_info(fs_info, "disk added %s", vol_args->name);
  
        kfree(vol_args);
  out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
  static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args_v2 *vol_args;
        int ret;
  
        if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
                return -EOPNOTSUPP;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out;
        }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
 -              ret = btrfs_rm_device(root, NULL, vol_args->devid);
 +              ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
        } else {
                vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 -              ret = btrfs_rm_device(root, vol_args->name, 0);
 +              ret = btrfs_rm_device(fs_info, vol_args->name, 0);
        }
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
  
        if (!ret) {
                if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
 -                      btrfs_info(root->fs_info, "device deleted: id %llu",
 +                      btrfs_info(fs_info, "device deleted: id %llu",
                                        vol_args->devid);
                else
 -                      btrfs_info(root->fs_info, "device deleted: %s",
 +                      btrfs_info(fs_info, "device deleted: %s",
                                        vol_args->name);
        }
  out:
@@@ -2746,8 -2744,7 +2746,8 @@@ err_drop
  
  static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
  
        if (ret)
                return ret;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out_drop_write;
        }
        }
  
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 -      mutex_lock(&root->fs_info->volume_mutex);
 -      ret = btrfs_rm_device(root, vol_args->name, 0);
 -      mutex_unlock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
 +      ret = btrfs_rm_device(fs_info, vol_args->name, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
  
        if (!ret)
 -              btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
 +              btrfs_info(fs_info, "disk deleted %s", vol_args->name);
        kfree(vol_args);
  out:
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
  out_drop_write:
        mnt_drop_write_file(file);
  
        return ret;
  }
  
 -static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 +                              void __user *arg)
  {
        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
 -      struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int ret = 0;
  
        fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
  
        mutex_lock(&fs_devices->device_list_mutex);
        fi_args->num_devices = fs_devices->num_devices;
 -      memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 +      memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
  
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->devid > fi_args->max_id)
        }
        mutex_unlock(&fs_devices->device_list_mutex);
  
 -      fi_args->nodesize = root->fs_info->super_copy->nodesize;
 -      fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
 -      fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
 +      fi_args->nodesize = fs_info->super_copy->nodesize;
 +      fi_args->sectorsize = fs_info->super_copy->sectorsize;
 +      fi_args->clone_alignment = fs_info->super_copy->sectorsize;
  
        if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 +                               void __user *arg)
  {
        struct btrfs_ioctl_dev_info_args *di_args;
        struct btrfs_device *dev;
 -      struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int ret = 0;
        char *s_uuid = NULL;
  
                s_uuid = di_args->uuid;
  
        mutex_lock(&fs_devices->device_list_mutex);
 -      dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
 +      dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
  
        if (!dev) {
                ret = -ENODEV;
@@@ -3309,10 -3305,10 +3309,10 @@@ static int clone_finish_inode_update(st
        ret = btrfs_update_inode(trans, root, inode);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                goto out;
        }
 -      ret = btrfs_end_transaction(trans, root);
 +      ret = btrfs_end_transaction(trans);
  out:
        return ret;
  }
@@@ -3410,10 -3406,9 +3410,10 @@@ static int clone_copy_inline_extent(str
                                    const u64 size,
                                    char *inline_data)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
        struct btrfs_root *root = BTRFS_I(dst)->root;
        const u64 aligned_end = ALIGN(new_key->offset + datal,
 -                                    root->sectorsize);
 +                                    fs_info->sectorsize);
        int ret;
        struct btrfs_key key;
  
@@@ -3534,7 -3529,6 +3534,7 @@@ static int btrfs_clone(struct inode *sr
                       const u64 off, const u64 olen, const u64 olen_aligned,
                       const u64 destoff, int no_time_update)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *leaf;
        u64 last_dest_end = destoff;
  
        ret = -ENOMEM;
 -      buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
 +      buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
        if (!buf) {
 -              buf = vmalloc(root->nodesize);
 +              buf = vmalloc(fs_info->nodesize);
                if (!buf)
                        return ret;
        }
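
btrfs_clone() allocates its scratch buffer with the try-kmalloc-then-vmalloc
idiom: __GFP_NOWARN silences the page allocator about a possibly large
contiguous request, because a vmalloc fallback exists. A kernel-side sketch of
the idiom only (later kernels fold exactly this into kvmalloc()/kvfree()):

    buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); /* fast, physically contiguous */
    if (!buf)
            buf = vmalloc(size);                    /* virtually contiguous fallback */
    if (!buf)
            return -ENOMEM;
    /* ... use buf ... */
    kvfree(buf);                                    /* frees either kind */
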
@@@ -3713,7 -3707,7 +3713,7 @@@ process_slot
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
  
                                                              &new_key, size);
                                if (ret) {
                                        btrfs_abort_transaction(trans, ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
  
  
                                if (disko) {
                                        inode_add_bytes(inode, datal);
 -                                      ret = btrfs_inc_extent_ref(trans, root,
 +                                      ret = btrfs_inc_extent_ref(trans,
 +                                                      fs_info,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
                                        if (ret) {
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                              btrfs_end_transaction(trans,
 -                                                                    root);
 +                                              btrfs_end_transaction(trans);
                                                goto out;
  
                                        }
  
                                if (comp && (skip || trim)) {
                                        ret = -EINVAL;
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
                                size -= skip + trim;
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
                                leaf = path->nodes[0];
                        btrfs_release_path(path);
  
                        last_dest_end = ALIGN(new_key.offset + datal,
 -                                            root->sectorsize);
 +                                            fs_info->sectorsize);
                        ret = clone_finish_inode_update(trans, inode,
                                                        last_dest_end,
                                                        destoff, olen,
                if (ret) {
                        if (ret != -EOPNOTSUPP)
                                btrfs_abort_transaction(trans, ret);
 -                      btrfs_end_transaction(trans, root);
 +                      btrfs_end_transaction(trans);
                        goto out;
                }
                clone_update_extent_map(inode, trans, NULL, last_dest_end,
@@@ -3869,11 -3863,10 +3869,11 @@@ static noinline int btrfs_clone_files(s
  {
        struct inode *inode = file_inode(file);
        struct inode *src = file_inode(file_src);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
        u64 len = olen;
 -      u64 bs = root->fs_info->sb->s_blocksize;
 +      u64 bs = fs_info->sb->s_blocksize;
        int same_inode = src == inode;
  
        /*
@@@ -3987,18 -3980,6 +3987,6 @@@ out_unlock
        return ret;
  }
  
- ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-                             struct file *file_out, loff_t pos_out,
-                             size_t len, unsigned int flags)
- {
-       ssize_t ret;
-       ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
-       if (ret == 0)
-               ret = len;
-       return ret;
- }
  int btrfs_clone_file_range(struct file *src_file, loff_t off,
                struct file *dst_file, loff_t destoff, u64 len)
  {
  static long btrfs_ioctl_trans_start(struct file *file)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret;
        if (ret)
                goto out;
  
 -      atomic_inc(&root->fs_info->open_ioctl_trans);
 +      atomic_inc(&fs_info->open_ioctl_trans);
  
        ret = -ENOMEM;
        trans = btrfs_start_ioctl_transaction(root);
        return 0;
  
  out_drop:
 -      atomic_dec(&root->fs_info->open_ioctl_trans);
 +      atomic_dec(&fs_info->open_ioctl_trans);
        mnt_drop_write_file(file);
  out:
        return ret;
  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        location.type = BTRFS_ROOT_ITEM_KEY;
        location.offset = (u64)-1;
  
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto out;
                goto out;
        }
  
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 -      di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
 +      di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
                                   dir_id, "default", 7, 1);
        if (IS_ERR_OR_NULL(di)) {
                btrfs_free_path(path);
 -              btrfs_end_transaction(trans, root);
 -              btrfs_err(new_root->fs_info,
 +              btrfs_end_transaction(trans);
 +              btrfs_err(fs_info,
                          "Umm, you don't have the default diritem, this isn't going to work");
                ret = -ENOENT;
                goto out;
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
 -      btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
 -      btrfs_end_transaction(trans, root);
 +      btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
 +      btrfs_end_transaction(trans);
  out:
        mnt_drop_write_file(file);
        return ret;
@@@ -4146,8 -4125,7 +4134,8 @@@ void btrfs_get_block_group_info(struct 
        }
  }
  
 -static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
 +                                 void __user *arg)
  {
        struct btrfs_ioctl_space_args space_args;
        struct btrfs_ioctl_space_info space;
  
                info = NULL;
                rcu_read_lock();
 -              list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
 +              list_for_each_entry_rcu(tmp, &fs_info->space_info,
                                        list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
  
                info = NULL;
                rcu_read_lock();
 -              list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
 +              list_for_each_entry_rcu(tmp, &fs_info->space_info,
                                        list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
         * Add global block reserve
         */
        if (slot_count) {
 -              struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
 +              struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
  
                spin_lock(&block_rsv->lock);
                space.total_bytes = block_rsv->size;
@@@ -4304,7 -4282,7 +4292,7 @@@ long btrfs_ioctl_trans_end(struct file 
                return -EINVAL;
        file->private_data = NULL;
  
 -      btrfs_end_transaction(trans, root);
 +      btrfs_end_transaction(trans);
  
        atomic_dec(&root->fs_info->open_ioctl_trans);
  
@@@ -4329,9 -4307,9 +4317,9 @@@ static noinline long btrfs_ioctl_start_
                goto out;
        }
        transid = trans->transid;
 -      ret = btrfs_commit_transaction_async(trans, root, 0);
 +      ret = btrfs_commit_transaction_async(trans, 0);
        if (ret) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                return ret;
        }
  out:
        return 0;
  }
  
 -static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
 +static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
                                           void __user *argp)
  {
        u64 transid;
        } else {
                transid = 0;  /* current trans */
        }
 -      return btrfs_wait_for_commit(root, transid);
 +      return btrfs_wait_for_commit(fs_info, transid);
  }
  
  static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
        struct btrfs_ioctl_scrub_args *sa;
        int ret;
  
                        goto out;
        }
  
 -      ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
 +      ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
                              &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
                              0);
  
        return ret;
  }
  
 -static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
  {
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      return btrfs_scrub_cancel(root->fs_info);
 +      return btrfs_scrub_cancel(fs_info);
  }
  
 -static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
 +static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
                                       void __user *arg)
  {
        struct btrfs_ioctl_scrub_args *sa;
        if (IS_ERR(sa))
                return PTR_ERR(sa);
  
 -      ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
 +      ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
 +static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
                                      void __user *arg)
  {
        struct btrfs_ioctl_get_dev_stats *sa;
                return -EPERM;
        }
  
 -      ret = btrfs_get_dev_stats(root, sa);
 +      ret = btrfs_get_dev_stats(fs_info, sa);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
 +                                  void __user *arg)
  {
        struct btrfs_ioctl_dev_replace_args *p;
        int ret;
  
        switch (p->cmd) {
        case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
 -              if (root->fs_info->sb->s_flags & MS_RDONLY) {
 +              if (fs_info->sb->s_flags & MS_RDONLY) {
                        ret = -EROFS;
                        goto out;
                }
                if (atomic_xchg(
 -                      &root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +                      &fs_info->mutually_exclusive_operation_running, 1)) {
                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                } else {
 -                      ret = btrfs_dev_replace_by_ioctl(root, p);
 +                      ret = btrfs_dev_replace_by_ioctl(fs_info, p);
                        atomic_set(
 -                       &root->fs_info->mutually_exclusive_operation_running,
 -                       0);
 +                       &fs_info->mutually_exclusive_operation_running, 0);
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
 -              btrfs_dev_replace_status(root->fs_info, p);
 +              btrfs_dev_replace_status(fs_info, p);
                ret = 0;
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
 -              ret = btrfs_dev_replace_cancel(root->fs_info, p);
 +              ret = btrfs_dev_replace_cancel(fs_info, p);
                break;
        default:
                ret = -EINVAL;
@@@ -4568,7 -4547,7 +4556,7 @@@ static int build_ino_list(u64 inum, u6
        return 0;
  }
  
 -static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 +static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
                                        void __user *arg)
  {
        int ret = 0;
                return -EPERM;
  
        loi = memdup_user(arg, sizeof(*loi));
 -      if (IS_ERR(loi)) {
 -              ret = PTR_ERR(loi);
 -              loi = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(loi))
 +              return PTR_ERR(loi);
  
        path = btrfs_alloc_path();
        if (!path) {
                goto out;
        }
  
 -      ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
 +      ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
                                          build_ino_list, inodes);
        if (ret == -EINVAL)
                ret = -ENOENT;
        return ret;
  }
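
The error-path simplification above relies on memdup_user() returning either a valid pointer or an ERR_PTR(), never NULL: when the duplication is the first allocation in the function there is nothing to unwind yet, so the NULL-assignment-plus-goto dance collapses to a direct return. The same cleanup repeats in both btrfs_ioctl_set_received_subvol variants further down:

	p = memdup_user(arg, sizeof(*p));
	if (IS_ERR(p))
		return PTR_ERR(p);	/* nothing allocated yet, nothing to unwind */
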
  
 -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
 +static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
  {
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
        switch (cmd) {
        case BTRFS_BALANCE_CTL_PAUSE:
 -              return btrfs_pause_balance(root->fs_info);
 +              return btrfs_pause_balance(fs_info);
        case BTRFS_BALANCE_CTL_CANCEL:
 -              return btrfs_cancel_balance(root->fs_info);
 +              return btrfs_cancel_balance(fs_info);
        }
  
        return -EINVAL;
  }
  
 -static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
 +static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
                                         void __user *arg)
  {
 -      struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ioctl_balance_args *bargs;
        int ret = 0;
  
@@@ -4843,8 -4826,7 +4831,8 @@@ out
  
  static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_ctl_args *sa;
        struct btrfs_trans_handle *trans = NULL;
        int ret;
                goto drop_write;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 -      trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
 +      down_write(&fs_info->subvol_sem);
 +      trans = btrfs_start_transaction(fs_info->tree_root, 2);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
  
        switch (sa->cmd) {
        case BTRFS_QUOTA_CTL_ENABLE:
 -              ret = btrfs_quota_enable(trans, root->fs_info);
 +              ret = btrfs_quota_enable(trans, fs_info);
                break;
        case BTRFS_QUOTA_CTL_DISABLE:
 -              ret = btrfs_quota_disable(trans, root->fs_info);
 +              ret = btrfs_quota_disable(trans, fs_info);
                break;
        default:
                ret = -EINVAL;
                break;
        }
  
 -      err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
 +      err = btrfs_commit_transaction(trans);
        if (err && !ret)
                ret = err;
  out:
        kfree(sa);
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
  drop_write:
        mnt_drop_write_file(file);
        return ret;
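
A recurring idiom in these handlers is worth noting: the result of ending or committing the transaction only overrides the return value when the ioctl body itself succeeded, so the first error wins:

	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;	/* report the commit failure only if nothing failed earlier */
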
  
  static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
  
        /* FIXME: check if the IDs really exist */
        if (sa->assign) {
 -              ret = btrfs_add_qgroup_relation(trans, root->fs_info,
 +              ret = btrfs_add_qgroup_relation(trans, fs_info,
                                                sa->src, sa->dst);
        } else {
 -              ret = btrfs_del_qgroup_relation(trans, root->fs_info,
 +              ret = btrfs_del_qgroup_relation(trans, fs_info,
                                                sa->src, sa->dst);
        }
  
        /* update qgroup status and info */
 -      err = btrfs_run_qgroups(trans, root->fs_info);
 +      err = btrfs_run_qgroups(trans, fs_info);
        if (err < 0)
 -              btrfs_handle_fs_error(root->fs_info, err,
 -                          "failed to update qgroup status and info");
 -      err = btrfs_end_transaction(trans, root);
 +              btrfs_handle_fs_error(fs_info, err,
 +                                    "failed to update qgroup status and info");
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -4949,9 -4929,7 +4937,9 @@@ drop_write
  
  static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_create_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
  
        /* FIXME: check if the IDs really exist */
        if (sa->create) {
 -              ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
 +              ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
        } else {
 -              ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
 +              ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
        }
  
 -      err = btrfs_end_transaction(trans, root);
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -5001,9 -4979,7 +4989,9 @@@ drop_write
  
  static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_limit_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        }
  
        /* FIXME: check if the IDs really exist */
 -      ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
 +      ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
  
 -      err = btrfs_end_transaction(trans, root);
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -5051,8 -5027,7 +5039,8 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret;
  
                goto out;
        }
  
 -      ret = btrfs_qgroup_rescan(root->fs_info);
 +      ret = btrfs_qgroup_rescan(fs_info);
  
  out:
        kfree(qsa);
@@@ -5085,8 -5060,7 +5073,8 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret = 0;
  
        if (!qsa)
                return -ENOMEM;
  
 -      if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
 +      if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
                qsa->flags = 1;
 -              qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
 +              qsa->progress = fs_info->qgroup_rescan_progress.objectid;
        }
  
        if (copy_to_user(arg, qsa, sizeof(*qsa)))
  
  static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      return btrfs_qgroup_wait_for_completion(root->fs_info, true);
 +      return btrfs_qgroup_wait_for_completion(fs_info, true);
  }
  
  static long _btrfs_ioctl_set_received_subvol(struct file *file,
                                            struct btrfs_ioctl_received_subvol_args *sa)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct btrfs_trans_handle *trans;
        if (ret < 0)
                return ret;
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
                ret = -EINVAL;
                                       BTRFS_UUID_SIZE);
        if (received_uuid_changed &&
            !btrfs_is_empty_uuid(root_item->received_uuid))
 -              btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 -                                  root_item->received_uuid,
 +              btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
                                    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                    root->root_key.objectid);
        memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
        btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
        btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
  
 -      ret = btrfs_update_root(trans, root->fs_info->tree_root,
 +      ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
        if (ret < 0) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                goto out;
        }
        if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
 -              ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
 -                                        sa->uuid,
 +              ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          root->root_key.objectid);
                if (ret < 0 && ret != -EEXIST) {
                        goto out;
                }
        }
 -      ret = btrfs_commit_transaction(trans, root);
 +      ret = btrfs_commit_transaction(trans);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
  
  out:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
        mnt_drop_write_file(file);
        return ret;
  }
@@@ -5217,8 -5191,11 +5205,8 @@@ static long btrfs_ioctl_set_received_su
        int ret = 0;
  
        args32 = memdup_user(arg, sizeof(*args32));
 -      if (IS_ERR(args32)) {
 -              ret = PTR_ERR(args32);
 -              args32 = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(args32))
 +              return PTR_ERR(args32);
  
        args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
        if (!args64) {
@@@ -5266,8 -5243,11 +5254,8 @@@ static long btrfs_ioctl_set_received_su
        int ret = 0;
  
        sa = memdup_user(arg, sizeof(*sa));
 -      if (IS_ERR(sa)) {
 -              ret = PTR_ERR(sa);
 -              sa = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(sa))
 +              return PTR_ERR(sa);
  
        ret = _btrfs_ioctl_set_received_subvol(file, sa);
  
  
  static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        size_t len;
        int ret;
        char label[BTRFS_LABEL_SIZE];
  
 -      spin_lock(&root->fs_info->super_lock);
 -      memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
 -      spin_unlock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
 +      memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
 +      spin_unlock(&fs_info->super_lock);
  
        len = strnlen(label, BTRFS_LABEL_SIZE);
  
        if (len == BTRFS_LABEL_SIZE) {
 -              btrfs_warn(root->fs_info,
 -                      "label is too long, return the first %zu bytes", --len);
 +              btrfs_warn(fs_info,
 +                         "label is too long, return the first %zu bytes",
 +                         --len);
        }
  
        ret = copy_to_user(arg, label, len);
  
  static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_trans_handle *trans;
        char label[BTRFS_LABEL_SIZE];
        int ret;
                return -EFAULT;
  
        if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
 -              btrfs_err(root->fs_info,
 +              btrfs_err(fs_info,
                          "unable to set label with more than %d bytes",
                          BTRFS_LABEL_SIZE - 1);
                return -EINVAL;
                goto out_unlock;
        }
  
 -      spin_lock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
        strcpy(super_block->label, label);
 -      spin_unlock(&root->fs_info->super_lock);
 -      ret = btrfs_commit_transaction(trans, root);
 +      spin_unlock(&fs_info->super_lock);
 +      ret = btrfs_commit_transaction(trans);
  
  out_unlock:
        mnt_drop_write_file(file);
@@@ -5372,9 -5348,8 +5360,9 @@@ int btrfs_ioctl_get_supported_features(
  
  static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags features;
  
        features.compat_flags = btrfs_super_compat_flags(super_block);
        return 0;
  }
  
 -static int check_feature_bits(struct btrfs_root *root,
 +static int check_feature_bits(struct btrfs_fs_info *fs_info,
                              enum btrfs_feature_set set,
                              u64 change_mask, u64 flags, u64 supported_flags,
                              u64 safe_set, u64 safe_clear)
        if (unsupported) {
                names = btrfs_printable_features(set, unsupported);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "this kernel does not support the %s feature bit%s",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "this kernel does not support the %s feature bit%s",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "this kernel does not support %s bits 0x%llx",
 -                         type, unsupported);
 +                      btrfs_warn(fs_info,
 +                                 "this kernel does not support %s bits 0x%llx",
 +                                 type, unsupported);
                return -EOPNOTSUPP;
        }
  
        if (disallowed) {
                names = btrfs_printable_features(set, disallowed);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "can't set the %s feature bit%s while mounted",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "can't set the %s feature bit%s while mounted",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "can't set %s bits 0x%llx while mounted",
 -                         type, disallowed);
 +                      btrfs_warn(fs_info,
 +                                 "can't set %s bits 0x%llx while mounted",
 +                                 type, disallowed);
                return -EPERM;
        }
  
        if (disallowed) {
                names = btrfs_printable_features(set, disallowed);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "can't clear the %s feature bit%s while mounted",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "can't clear the %s feature bit%s while mounted",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "can't clear %s bits 0x%llx while mounted",
 -                         type, disallowed);
 +                      btrfs_warn(fs_info,
 +                                 "can't clear %s bits 0x%llx while mounted",
 +                                 type, disallowed);
                return -EPERM;
        }
  
        return 0;
  }
  
 -#define check_feature(root, change_mask, flags, mask_base)    \
 -check_feature_bits(root, FEAT_##mask_base, change_mask, flags,        \
 +#define check_feature(fs_info, change_mask, flags, mask_base) \
 +check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags,     \
                   BTRFS_FEATURE_ ## mask_base ## _SUPP,        \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,    \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
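
The macro keeps its shape; only the first argument changes type. For reference, the COMPAT check below expands to (expansion written out by hand):

	ret = check_feature(fs_info, flags[0].compat_flags,
			    flags[1].compat_flags, COMPAT);
	/* becomes: */
	ret = check_feature_bits(fs_info, FEAT_COMPAT,
				 flags[0].compat_flags, flags[1].compat_flags,
				 BTRFS_FEATURE_COMPAT_SUPP,
				 BTRFS_FEATURE_COMPAT_SAFE_SET,
				 BTRFS_FEATURE_COMPAT_SAFE_CLEAR);
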
  
  static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags flags[2];
        struct btrfs_trans_handle *trans;
        u64 newflags;
            !flags[0].incompat_flags)
                return 0;
  
 -      ret = check_feature(root, flags[0].compat_flags,
 +      ret = check_feature(fs_info, flags[0].compat_flags,
                            flags[1].compat_flags, COMPAT);
        if (ret)
                return ret;
  
 -      ret = check_feature(root, flags[0].compat_ro_flags,
 +      ret = check_feature(fs_info, flags[0].compat_ro_flags,
                            flags[1].compat_ro_flags, COMPAT_RO);
        if (ret)
                return ret;
  
 -      ret = check_feature(root, flags[0].incompat_flags,
 +      ret = check_feature(fs_info, flags[0].incompat_flags,
                            flags[1].incompat_flags, INCOMPAT);
        if (ret)
                return ret;
                goto out_drop_write;
        }
  
 -      spin_lock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
        newflags = btrfs_super_compat_flags(super_block);
        newflags |= flags[0].compat_flags & flags[1].compat_flags;
        newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
        newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
        newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
        btrfs_set_super_incompat_flags(super_block, newflags);
 -      spin_unlock(&root->fs_info->super_lock);
 +      spin_unlock(&fs_info->super_lock);
  
 -      ret = btrfs_commit_transaction(trans, root);
 +      ret = btrfs_commit_transaction(trans);
  out_drop_write:
        mnt_drop_write_file(file);
  
  long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        void __user *argp = (void __user *)arg;
  
        switch (cmd) {
        case BTRFS_IOC_RESIZE:
                return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
 -              return btrfs_ioctl_add_dev(root, argp);
 +              return btrfs_ioctl_add_dev(fs_info, argp);
        case BTRFS_IOC_RM_DEV:
                return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_RM_DEV_V2:
                return btrfs_ioctl_rm_dev_v2(file, argp);
        case BTRFS_IOC_FS_INFO:
 -              return btrfs_ioctl_fs_info(root, argp);
 +              return btrfs_ioctl_fs_info(fs_info, argp);
        case BTRFS_IOC_DEV_INFO:
 -              return btrfs_ioctl_dev_info(root, argp);
 +              return btrfs_ioctl_dev_info(fs_info, argp);
        case BTRFS_IOC_BALANCE:
                return btrfs_ioctl_balance(file, NULL);
        case BTRFS_IOC_TRANS_START:
        case BTRFS_IOC_INO_PATHS:
                return btrfs_ioctl_ino_to_path(root, argp);
        case BTRFS_IOC_LOGICAL_INO:
 -              return btrfs_ioctl_logical_to_ino(root, argp);
 +              return btrfs_ioctl_logical_to_ino(fs_info, argp);
        case BTRFS_IOC_SPACE_INFO:
 -              return btrfs_ioctl_space_info(root, argp);
 +              return btrfs_ioctl_space_info(fs_info, argp);
        case BTRFS_IOC_SYNC: {
                int ret;
  
 -              ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 +              ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
                if (ret)
                        return ret;
 -              ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
 +              ret = btrfs_sync_fs(inode->i_sb, 1);
                /*
                 * The transaction thread may want to do more work,
                 * namely it pokes the cleaner kthread that will start
                 * processing uncleaned subvols.
                 */
 -              wake_up_process(root->fs_info->transaction_kthread);
 +              wake_up_process(fs_info->transaction_kthread);
                return ret;
        }
        case BTRFS_IOC_START_SYNC:
                return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
 -              return btrfs_ioctl_wait_sync(root, argp);
 +              return btrfs_ioctl_wait_sync(fs_info, argp);
        case BTRFS_IOC_SCRUB:
                return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
 -              return btrfs_ioctl_scrub_cancel(root, argp);
 +              return btrfs_ioctl_scrub_cancel(fs_info);
        case BTRFS_IOC_SCRUB_PROGRESS:
 -              return btrfs_ioctl_scrub_progress(root, argp);
 +              return btrfs_ioctl_scrub_progress(fs_info, argp);
        case BTRFS_IOC_BALANCE_V2:
                return btrfs_ioctl_balance(file, argp);
        case BTRFS_IOC_BALANCE_CTL:
 -              return btrfs_ioctl_balance_ctl(root, arg);
 +              return btrfs_ioctl_balance_ctl(fs_info, arg);
        case BTRFS_IOC_BALANCE_PROGRESS:
 -              return btrfs_ioctl_balance_progress(root, argp);
 +              return btrfs_ioctl_balance_progress(fs_info, argp);
        case BTRFS_IOC_SET_RECEIVED_SUBVOL:
                return btrfs_ioctl_set_received_subvol(file, argp);
  #ifdef CONFIG_64BIT
        case BTRFS_IOC_SEND:
                return btrfs_ioctl_send(file, argp);
        case BTRFS_IOC_GET_DEV_STATS:
 -              return btrfs_ioctl_get_dev_stats(root, argp);
 +              return btrfs_ioctl_get_dev_stats(fs_info, argp);
        case BTRFS_IOC_QUOTA_CTL:
                return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
        case BTRFS_IOC_QUOTA_RESCAN_WAIT:
                return btrfs_ioctl_quota_rescan_wait(file, argp);
        case BTRFS_IOC_DEV_REPLACE:
 -              return btrfs_ioctl_dev_replace(root, argp);
 +              return btrfs_ioctl_dev_replace(fs_info, argp);
        case BTRFS_IOC_GET_FSLABEL:
                return btrfs_ioctl_get_fslabel(file, argp);
        case BTRFS_IOC_SET_FSLABEL:
diff --combined fs/ceph/addr.c
@@@ -315,32 -315,7 +315,32 @@@ static int start_read(struct inode *ino
        struct page **pages;
        pgoff_t next_index;
        int nr_pages = 0;
 -      int ret;
 +      int got = 0;
 +      int ret = 0;
 +
 +      if (!current->journal_info) {
 +              /* caller of readpages does not hold buffer and read caps
 +               * (fadvise, madvise and readahead cases) */
 +              int want = CEPH_CAP_FILE_CACHE;
 +              ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
 +              if (ret < 0) {
 +                      dout("start_read %p, error getting cap\n", inode);
 +              } else if (!(got & want)) {
 +                      dout("start_read %p, no cache cap\n", inode);
 +                      ret = 0;
 +              }
 +              if (ret <= 0) {
 +                      if (got)
 +                              ceph_put_cap_refs(ci, got);
 +                      while (!list_empty(page_list)) {
 +                              page = list_entry(page_list->prev,
 +                                                struct page, lru);
 +                              list_del(&page->lru);
 +                              put_page(page);
 +                      }
 +                      return ret;
 +              }
 +      }
  
        off = (u64) page_offset(page);
  
                                    CEPH_OSD_FLAG_READ, NULL,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    false);
 -      if (IS_ERR(req))
 -              return PTR_ERR(req);
 +      if (IS_ERR(req)) {
 +              ret = PTR_ERR(req);
 +              goto out;
 +      }
  
        /* build page vector */
        nr_pages = calc_pages_for(0, len);
        pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
 -      ret = -ENOMEM;
 -      if (!pages)
 -              goto out;
 +      if (!pages) {
 +              ret = -ENOMEM;
 +              goto out_put;
 +      }
        for (i = 0; i < nr_pages; ++i) {
                page = list_entry(page_list->prev, struct page, lru);
                BUG_ON(PageLocked(page));
        if (ret < 0)
                goto out_pages;
        ceph_osdc_put_request(req);
 +
 +      /* After adding locked pages to page cache, the inode holds cache cap.
 +       * So we can drop our cap refs. */
 +      if (got)
 +              ceph_put_cap_refs(ci, got);
 +
        return nr_pages;
  
  out_pages:
                unlock_page(pages[i]);
        }
        ceph_put_page_vector(pages, nr_pages, false);
 -out:
 +out_put:
        ceph_osdc_put_request(req);
 +out:
 +      if (got)
 +              ceph_put_cap_refs(ci, got);
        return ret;
  }
  
@@@ -461,6 -424,7 +461,6 @@@ static int ceph_readpages(struct file *
                rc = start_read(inode, page_list, max);
                if (rc < 0)
                        goto out;
 -              BUG_ON(rc == 0);
        }
  out:
        ceph_fscache_readpages_cancel(inode, page_list);
   * only snap context we are allowed to write back.
   */
  static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 -                                                  loff_t *snap_size)
 +                                                  loff_t *snap_size,
 +                                                  u64 *truncate_size,
 +                                                  u32 *truncate_seq)
  {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_snap_context *snapc = NULL;
                        snapc = ceph_get_snap_context(capsnap->context);
                        if (snap_size)
                                *snap_size = capsnap->size;
 +                      if (truncate_size)
 +                              *truncate_size = capsnap->truncate_size;
 +                      if (truncate_seq)
 +                              *truncate_seq = capsnap->truncate_seq;
                        break;
                }
        }
                snapc = ceph_get_snap_context(ci->i_head_snapc);
                dout(" head snapc %p has %d dirty pages\n",
                     snapc, ci->i_wrbuffer_ref_head);
 +              if (truncate_size)
 +                      *truncate_size = ci->i_truncate_size;
 +              if (truncate_seq)
 +                      *truncate_seq = ci->i_truncate_seq;
        }
        spin_unlock(&ci->i_ceph_lock);
        return snapc;
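
get_oldest_context() now hands back the truncate metadata that belongs with whichever snap context it picked, captured in a single pass under i_ceph_lock: the capsnap branch reports the snapshot's values, the head branch the inode's current ones (after the loop, capsnap no longer points at a valid entry, so the head branch must read from the inode). That is what lets the separate spin_lock(&ci->i_ceph_lock) blocks in writepage_nounlock() and the writepages path disappear in the hunks below; callers get everything in one call:

	snapc = get_oldest_context(inode, &snap_size,
				   &truncate_size, &truncate_seq);
	if (snap_size == -1)
		snap_size = i_size_read(inode);	/* head data, not a snapshot */
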
@@@ -547,8 -501,7 +547,8 @@@ static int writepage_nounlock(struct pa
                dout("writepage %p page %p not dirty?\n", inode, page);
                goto out;
        }
 -      oldest = get_oldest_context(inode, &snap_size);
 +      oldest = get_oldest_context(inode, &snap_size,
 +                                  &truncate_size, &truncate_seq);
        if (snapc->seq > oldest->seq) {
                dout("writepage %p page %p snapc %p not writeable - noop\n",
                     inode, page, snapc);
        }
        ceph_put_snap_context(oldest);
  
 -      spin_lock(&ci->i_ceph_lock);
 -      truncate_seq = ci->i_truncate_seq;
 -      truncate_size = ci->i_truncate_size;
        if (snap_size == -1)
                snap_size = i_size_read(inode);
 -      spin_unlock(&ci->i_ceph_lock);
  
        /* is this a partial page at end of file? */
        if (page_off >= snap_size) {
@@@ -807,8 -764,7 +807,8 @@@ retry
        /* find oldest snap context with dirty data */
        ceph_put_snap_context(snapc);
        snap_size = -1;
 -      snapc = get_oldest_context(inode, &snap_size);
 +      snapc = get_oldest_context(inode, &snap_size,
 +                                 &truncate_size, &truncate_seq);
        if (!snapc) {
                /* hmm, why does writepages get called when there
                   is no dirty data? */
        dout(" oldest snapc is %p seq %lld (%d snaps)\n",
             snapc, snapc->seq, snapc->num_snaps);
  
 -      spin_lock(&ci->i_ceph_lock);
 -      truncate_seq = ci->i_truncate_seq;
 -      truncate_size = ci->i_truncate_size;
        i_size = i_size_read(inode);
 -      spin_unlock(&ci->i_ceph_lock);
  
        if (last_snapc && snapc != last_snapc) {
                /* if we switched to a newer snapc, restart our scan at the
@@@ -1164,8 -1124,7 +1164,8 @@@ out
  static int context_is_writeable_or_written(struct inode *inode,
                                           struct ceph_snap_context *snapc)
  {
 -      struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
 +      struct ceph_snap_context *oldest = get_oldest_context(inode, NULL,
 +                                                            NULL, NULL);
        int ret = !oldest || snapc->seq <= oldest->seq;
  
        ceph_put_snap_context(oldest);
@@@ -1210,7 -1169,7 +1210,7 @@@ retry_locked
                 * this page is already dirty in another (older) snap
                 * context!  is it writeable now?
                 */
 -              oldest = get_oldest_context(inode, NULL);
 +              oldest = get_oldest_context(inode, NULL, NULL, NULL);
  
                if (snapc->seq > oldest->seq) {
                        ceph_put_snap_context(oldest);
@@@ -1317,25 -1276,27 +1317,27 @@@ static int ceph_write_end(struct file *
                          struct page *page, void *fsdata)
  {
        struct inode *inode = file_inode(file);
-       unsigned from = pos & (PAGE_SIZE - 1);
        int check_cap = 0;
  
        dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
             inode, page, (int)pos, (int)copied, (int)len);
  
        /* zero the stale part of the page if we did a short copy */
-       if (copied < len)
-               zero_user_segment(page, from+copied, len);
+       if (!PageUptodate(page)) {
+               if (copied < len) {
+                       copied = 0;
+                       goto out;
+               }
+               SetPageUptodate(page);
+       }
  
        /* did file size increase? */
        if (pos+copied > i_size_read(inode))
                check_cap = ceph_inode_set_size(inode, pos+copied);
  
-       if (!PageUptodate(page))
-               SetPageUptodate(page);
        set_page_dirty(page);
  
+ out:
        unlock_page(page);
        put_page(page);
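
The rewritten ceph_write_end() no longer papers over a short copy by zeroing the tail of a page that was never uptodate. Instead it reports zero bytes copied, which is the convention generic_perform_write() in mm/filemap.c follows for "retry this page": the caller shrinks the write to a single iovec segment it can fault in first and goes around again, so no stale zeros ever become visible. Roughly, on the caller's side (simplified sketch):

	copied = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);
	if (unlikely(copied == 0)) {
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
			      iov_iter_single_seg_count(i));
		goto again;	/* re-fault the user page and retry */
	}
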
  
@@@ -1412,11 -1373,9 +1414,11 @@@ static int ceph_filemap_fault(struct vm
             inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
  
        if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
 -          ci->i_inline_version == CEPH_INLINE_NONE)
 +          ci->i_inline_version == CEPH_INLINE_NONE) {
 +              current->journal_info = vma->vm_file;
                ret = filemap_fault(vma, vmf);
 -      else
 +              current->journal_info = NULL;
 +      } else
                ret = -EAGAIN;
  
        dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
@@@ -1948,15 -1907,6 +1950,15 @@@ int ceph_pool_perm_check(struct ceph_in
        struct ceph_string *pool_ns;
        int ret, flags;
  
 +      if (ci->i_vino.snap != CEPH_NOSNAP) {
 +              /*
 +               * Pool permission check needs to write to the first object.
 +               * But for snapshot, head of the first object may have already
 +               * been deleted. Skip check to avoid creating orphan object.
 +               */
 +              return 0;
 +      }
 +
        if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
                                NOPOOLPERM))
                return 0;
diff --combined fs/ext4/super.c
@@@ -863,6 -863,7 +863,6 @@@ static void ext4_put_super(struct super
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 -      brelse(sbi->s_sbh);
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(sbi->s_qf_names[i]);
        }
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 +      brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
@@@ -1114,55 -1114,37 +1114,55 @@@ static int ext4_prepare_context(struct 
  static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                                                        void *fs_data)
  {
 -      handle_t *handle;
 -      int res, res2;
 +      handle_t *handle = fs_data;
 +      int res, res2, retries = 0;
 +
 +      /*
 +       * If a journal handle was specified, then the encryption context is
 +       * being set on a new inode via inheritance and is part of a larger
 +       * transaction to create the inode.  Otherwise the encryption context is
 +       * being set on an existing inode in its own transaction.  Only in the
 +       * latter case should the "retry on ENOSPC" logic be used.
 +       */
  
 -      /* fs_data is null when internally used. */
 -      if (fs_data) {
 -              res  = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
 -                              EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
 -                              len, 0);
 +      if (handle) {
 +              res = ext4_xattr_set_handle(handle, inode,
 +                                          EXT4_XATTR_INDEX_ENCRYPTION,
 +                                          EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
 +                                          ctx, len, 0);
                if (!res) {
                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
 +                      /*
 +                       * Update inode->i_flags - e.g. S_DAX may get disabled
 +                       */
 +                      ext4_set_inode_flags(inode);
                }
                return res;
        }
  
 +retry:
        handle = ext4_journal_start(inode, EXT4_HT_MISC,
                        ext4_jbd2_credits_xattr(inode));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
  
 -      res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
 -                      EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
 -                      len, 0);
 +      res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
 +                                  EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
 +                                  ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
 +              /* Update inode->i_flags - e.g. S_DAX may get disabled */
 +              ext4_set_inode_flags(inode);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
        }
        res2 = ext4_journal_stop(handle);
 +
 +      if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 +              goto retry;
        if (!res)
                res = res2;
        return res;
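
When no handle is passed in, the context is being set on an existing inode and the xattr write can run out of space; the new loop is the standard ext4 ENOSPC idiom: stop the handle, let ext4_should_retry_alloc() try to free space by forcing a journal commit, and start a fresh transaction, giving up after a few attempts. The skeleton:

	int retries = 0;
retry:
	handle = ext4_journal_start(inode, EXT4_HT_MISC,
				    ext4_jbd2_credits_xattr(inode));
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	res = ext4_xattr_set_handle(handle, inode, ...);	/* may hit ENOSPC */
	res2 = ext4_journal_stop(handle);
	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
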
@@@ -1205,7 -1187,7 +1205,7 @@@ static int ext4_release_dquot(struct dq
  static int ext4_mark_dquot_dirty(struct dquot *dquot);
  static int ext4_write_info(struct super_block *sb, int type);
  static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                        struct path *path);
+                        const struct path *path);
  static int ext4_quota_off(struct super_block *sb, int type);
  static int ext4_quota_on_mount(struct super_block *sb, int type);
  static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@@ -1901,6 -1883,12 +1901,6 @@@ static int parse_options(char *options
                        return 0;
                }
        }
 -      if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
 -          test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 -              ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
 -                       "in data=ordered mode");
 -              return 0;
 -      }
        return 1;
  }
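
The removed journal_async_commit/data=ordered rejection is not lost; it reappears where the final journalling mode is actually known: once in ext4_fill_super() after the journal has been loaded (so it can bail out via failed_mount_wq) and once in ext4_remount(), as the hunks further down show:

	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
		ext4_msg(sb, KERN_ERR, "can't mount with "
			"journal_async_commit in data=ordered mode");
		goto failed_mount_wq;
	}
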
  
@@@ -2342,7 -2330,7 +2342,7 @@@ static void ext4_orphan_cleanup(struct 
                                struct ext4_super_block *es)
  {
        unsigned int s_flags = sb->s_flags;
 -      int nr_orphans = 0, nr_truncates = 0;
 +      int ret, nr_orphans = 0, nr_truncates = 0;
  #ifdef CONFIG_QUOTA
        int i;
  #endif
                                  inode->i_ino, inode->i_size);
                        inode_lock(inode);
                        truncate_inode_pages(inode->i_mapping, inode->i_size);
 -                      ext4_truncate(inode);
 +                      ret = ext4_truncate(inode);
 +                      if (ret)
 +                              ext4_std_error(inode->i_sb, ret);
                        inode_unlock(inode);
                        nr_truncates++;
                } else {
@@@ -3207,15 -3193,10 +3207,15 @@@ static int count_overhead(struct super_
                        ext4_set_bit(s++, buf);
                        count++;
                }
 -              for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
 -                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
 -                      count++;
 +              j = ext4_bg_num_gdb(sb, grp);
 +              if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
 +                      ext4_error(sb, "Invalid number of block group "
 +                                 "descriptor blocks: %d", j);
 +                      j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
 +              count += j;
 +              for (; j > 0; j--)
 +                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        if (!count)
                return 0;
@@@ -3320,7 -3301,7 +3320,7 @@@ static int ext4_fill_super(struct super
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
 -      struct ext4_sb_info *sbi;
 +      struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        ext4_fsblk_t block;
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        ext4_group_t first_not_zeroed;
  
 -      sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 -      if (!sbi)
 -              goto out_free_orig;
 +      if ((data && !orig_data) || !sbi)
 +              goto out_free_base;
  
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 -      if (!sbi->s_blockgroup_lock) {
 -              kfree(sbi);
 -              goto out_free_orig;
 -      }
 +      if (!sbi->s_blockgroup_lock)
 +              goto out_free_base;
 +
        sb->s_fs_info = sbi;
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
  
 -      if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
 -                         &journal_devnum, &journal_ioprio, 0)) {
 -              ext4_msg(sb, KERN_WARNING,
 -                       "failed to parse options in superblock: %s",
 -                       sbi->s_es->s_mount_opts);
 +      if (sbi->s_es->s_mount_opts[0]) {
 +              char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
 +                                            sizeof(sbi->s_es->s_mount_opts),
 +                                            GFP_KERNEL);
 +              if (!s_mount_opts)
 +                      goto failed_mount;
 +              if (!parse_options(s_mount_opts, sb, &journal_devnum,
 +                                 &journal_ioprio, 0)) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "failed to parse options in superblock: %s",
 +                               s_mount_opts);
 +              }
 +              kfree(s_mount_opts);
        }
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        if (!parse_options((char *) data, sb, &journal_devnum,
                                 "both data=journal and dax");
                        goto failed_mount;
                }
 +              if (ext4_has_feature_encrypt(sb)) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "encrypted files will use data=ordered "
 +                               "instead of data journaling mode");
 +              }
                if (test_opt(sb, DELALLOC))
                        clear_opt(sb, DELALLOC);
        } else {
  
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 -      if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
 -              goto cantfind_ext4;
  
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
 +      if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
 +          sbi->s_inodes_per_group > blocksize * 8) {
 +              ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
 +                       sbi->s_inodes_per_group);
 +              goto failed_mount;
 +      }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
        }
        sbi->s_cluster_ratio = clustersize / blocksize;
  
 -      if (sbi->s_inodes_per_group > blocksize * 8) {
 -              ext4_msg(sb, KERN_ERR,
 -                     "#inodes per group too big: %lu",
 -                     sbi->s_inodes_per_group);
 -              goto failed_mount;
 -      }
 -
        /* Do we have standard group size of clustersize * 8 blocks ? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
 +      if (ext4_has_feature_meta_bg(sb)) {
 +              if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "first meta block group too large: %u "
 +                               "(group descriptor block count %u)",
 +                               le32_to_cpu(es->s_first_meta_bg), db_count);
 +                      goto failed_mount;
 +              }
 +      }
        sbi->s_group_desc = ext4_kvmalloc(db_count *
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL);
        default:
                break;
        }
 +
 +      if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
 +          test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +              ext4_msg(sb, KERN_ERR, "can't mount with "
 +                      "journal_async_commit in data=ordered mode");
 +              goto failed_mount_wq;
 +      }
 +
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
  
        sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@@ -4204,9 -4160,7 +4204,9 @@@ no_journal
  
        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
 -                       "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
 +                       "Opts: %.*s%s%s", descr,
 +                       (int) sizeof(sbi->s_es->s_mount_opts),
 +                       sbi->s_es->s_mount_opts,
                         *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
  
        if (es->s_error_count)
@@@ -4285,8 -4239,8 +4285,8 @@@ failed_mount
  out_fail:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
 +out_free_base:
        kfree(sbi);
 -out_free_orig:
        kfree(orig_data);
        return err ? err : ret;
  }
@@@ -4596,8 -4550,7 +4596,8 @@@ static int ext4_commit_super(struct sup
                                &EXT4_SB(sb)->s_freeinodes_counter));
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
 -      lock_buffer(sbh);
 +      if (sync)
 +              lock_buffer(sbh);
        if (buffer_write_io_error(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
                set_buffer_uptodate(sbh);
        }
        mark_buffer_dirty(sbh);
 -      unlock_buffer(sbh);
        if (sync) {
 +              unlock_buffer(sbh);
                error = __sync_dirty_buffer(sbh,
 -                      test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
 +                      test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
                if (error)
                        return error;
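
The buffer lock in ext4_commit_super() is now taken only on the sync path, where it serializes the write-error handling and re-dirtying against a racing write completion before __sync_dirty_buffer(); the async path just marks the buffer dirty without locking, presumably so callers that must not sleep are no longer forced through lock_buffer(). WRITE_FUA/WRITE_SYNC becoming REQ_FUA/REQ_SYNC is the concurrent block-layer flag rename, with no behaviour change:

	if (sync)
		lock_buffer(sbh);
	/* ... clear a previous write error, mark_buffer_dirty(sbh) ... */
	if (sync) {
		unlock_buffer(sbh);
		error = __sync_dirty_buffer(sbh,
			test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
	}
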
  
@@@ -4904,13 -4857,6 +4904,13 @@@ static int ext4_remount(struct super_bl
                        err = -EINVAL;
                        goto restore_opts;
                }
 +      } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
 +              if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                              "journal_async_commit in data=ordered mode");
 +                      err = -EINVAL;
 +                      goto restore_opts;
 +              }
        }
  
        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
@@@ -5293,7 -5239,7 +5293,7 @@@ static void lockdep_set_quota_inode(str
   * Standard function to be called on quota_on
   */
  static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                        struct path *path)
+                        const struct path *path)
  {
        int err;
  
@@@ -5420,7 -5366,7 +5420,7 @@@ static int ext4_quota_off(struct super_
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle))
                goto out;
 -      inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 +      inode->i_mtime = inode->i_ctime = current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
  
diff --combined fs/internal.h
@@@ -62,7 -62,7 +62,7 @@@ extern int vfs_path_lookup(struct dentr
  extern void *copy_mount_options(const void __user *);
  extern char *copy_mount_string(const void __user *);
  
- extern struct vfsmount *lookup_mnt(struct path *);
+ extern struct vfsmount *lookup_mnt(const struct path *);
  extern int finish_automount(struct vfsmount *, struct path *);
  
  extern int sb_prepare_remount_readonly(struct super_block *);
@@@ -184,6 -184,3 +184,6 @@@ typedef loff_t (*iomap_actor_t)(struct 
  loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap_ops *ops, void *data,
                iomap_actor_t actor);
 +
 +/* direct-io.c: */
 +int sb_init_dio_done_wq(struct super_block *sb);
diff --combined fs/namei.c
@@@ -1200,7 -1200,7 +1200,7 @@@ static int follow_managed(struct path *
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(path->dentry, false);
+                       ret = path->dentry->d_op->d_manage(path, false);
                        if (ret < 0)
                                break;
                }
@@@ -1263,10 -1263,10 +1263,10 @@@ int follow_down_one(struct path *path
  }
  EXPORT_SYMBOL(follow_down_one);
  
- static inline int managed_dentry_rcu(struct dentry *dentry)
+ static inline int managed_dentry_rcu(const struct path *path)
  {
-       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
-               dentry->d_op->d_manage(dentry, true) : 0;
+       return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+               path->dentry->d_op->d_manage(path, true) : 0;
  }
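
->d_manage() now receives the whole path, const-qualified, instead of a bare dentry, so a managed-dentry handler can see which vfsmount it is being traversed through rather than just which dentry. For a filesystem providing the hook, the signature change looks like this (example_d_manage is an illustrative name, not a real handler):

	static int example_d_manage(const struct path *path, bool rcu_walk)
	{
		struct dentry *dentry = path->dentry;	/* the old sole argument */

		/* path->mnt is now visible too */
		return 0;	/* 0 = allow transit; -EISDIR/-ECHILD etc. as before */
	}
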
  
  /*
@@@ -1282,7 -1282,7 +1282,7 @@@ static bool __follow_mount_rcu(struct n
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
-               switch (managed_dentry_rcu(path->dentry)) {
+               switch (managed_dentry_rcu(path)) {
                case -ECHILD:
                default:
                        return false;
@@@ -1392,8 -1392,7 +1392,7 @@@ int follow_down(struct path *path
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(
-                               path->dentry, false);
+                       ret = path->dentry->d_op->d_manage(path, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }
@@@ -1725,35 -1724,30 +1724,35 @@@ static int pick_link(struct nameidata *
        return 1;
  }
  
 +enum {WALK_FOLLOW = 1, WALK_MORE = 2};
 +
  /*
   * Do we need to follow links? We _really_ want to be able
   * to do this check without having to look at inode->i_op,
   * so we keep a cache of "no, this doesn't need follow_link"
   * for the common case.
   */
 -static inline int should_follow_link(struct nameidata *nd, struct path *link,
 -                                   int follow,
 -                                   struct inode *inode, unsigned seq)
 +static inline int step_into(struct nameidata *nd, struct path *path,
 +                          int flags, struct inode *inode, unsigned seq)
  {
 -      if (likely(!d_is_symlink(link->dentry)))
 -              return 0;
 -      if (!follow)
 +      if (!(flags & WALK_MORE) && nd->depth)
 +              put_link(nd);
 +      if (likely(!d_is_symlink(path->dentry)) ||
 +         !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
 +              /* not a symlink or should not follow */
 +              path_to_nameidata(path, nd);
 +              nd->inode = inode;
 +              nd->seq = seq;
                return 0;
 +      }
        /* make sure that d_is_symlink above matches inode */
        if (nd->flags & LOOKUP_RCU) {
 -              if (read_seqcount_retry(&link->dentry->d_seq, seq))
 +              if (read_seqcount_retry(&path->dentry->d_seq, seq))
                        return -ECHILD;
        }
 -      return pick_link(nd, link, inode, seq);
 +      return pick_link(nd, path, inode, seq);
  }
  
 -enum {WALK_GET = 1, WALK_PUT = 2};
 -
  static int walk_component(struct nameidata *nd, int flags)
  {
        struct path path;
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                err = handle_dots(nd, nd->last_type);
 -              if (flags & WALK_PUT)
 +              if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return err;
        }
                inode = d_backing_inode(path.dentry);
        }
  
 -      if (flags & WALK_PUT)
 -              put_link(nd);
 -      err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
 -      if (unlikely(err))
 -              return err;
 -      path_to_nameidata(&path, nd);
 -      nd->inode = inode;
 -      nd->seq = seq;
 -      return 0;
 +      return step_into(nd, &path, flags, inode, seq);
  }
  
  /*
                        if (!name)
                                return 0;
                        /* last component of nested symlink */
 -                      err = walk_component(nd, WALK_GET | WALK_PUT);
 +                      err = walk_component(nd, WALK_FOLLOW);
                } else {
 -                      err = walk_component(nd, WALK_GET);
 +                      /* not the last component */
 +                      err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
                }
                if (err < 0)
                        return err;
@@@ -2246,7 -2247,12 +2245,7 @@@ static inline int lookup_last(struct na
                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
  
        nd->flags &= ~LOOKUP_PARENT;
 -      return walk_component(nd,
 -                      nd->flags & LOOKUP_FOLLOW
 -                              ? nd->depth
 -                                      ? WALK_PUT | WALK_GET
 -                                      : WALK_GET
 -                              : 0);
 +      return walk_component(nd, 0);
  }
  
  /* Returns 0 and nd will be valid on success; returns error otherwise. */
@@@ -2551,9 -2557,28 +2550,9 @@@ int user_path_at_empty(int dfd, const c
  }
  EXPORT_SYMBOL(user_path_at_empty);
  
 -/*
 - * NB: most callers don't do anything directly with the reference to the
 - *     to struct filename, but the nd->last pointer points into the name string
 - *     allocated by getname. So we must hold the reference to it until all
 - *     path-walking is complete.
 - */
 -static inline struct filename *
 -user_path_parent(int dfd, const char __user *path,
 -               struct path *parent,
 -               struct qstr *last,
 -               int *type,
 -               unsigned int flags)
 -{
 -      /* only LOOKUP_REVAL is allowed in extra flags */
 -      return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
 -                               parent, last, type);
 -}
 -
  /**
   * mountpoint_last - look up last component for umount
   * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 - * @path: pointer to container for result
   *
   * This is a special lookup_last function just for umount. In this case, we
   * need to resolve the path without doing any revalidation.
   *
   * Returns:
   * -error: if there was an error during lookup. This includes -ENOENT if the
 - *         lookup found a negative dentry. The nd->path reference will also be
 - *         put in this case.
 + *         lookup found a negative dentry.
   *
 - * 0:      if we successfully resolved nd->path and found it to not to be a
 - *         symlink that needs to be followed. "path" will also be populated.
 - *         The nd->path reference will also be put.
 + * 0:      if we successfully resolved nd->last and found it not to be a
 + *         symlink that needs to be followed.
   *
   * 1:      if we successfully resolved nd->last and found it to be a symlink
 - *         that needs to be followed. "path" will be populated with the path
 - *         to the link, and nd->path will *not* be put.
 + *         that needs to be followed.
   */
  static int
 -mountpoint_last(struct nameidata *nd, struct path *path)
 +mountpoint_last(struct nameidata *nd)
  {
        int error = 0;
 -      struct dentry *dentry;
        struct dentry *dir = nd->path.dentry;
 +      struct path path;
  
        /* If we're in rcuwalk, drop out of it to handle last component */
        if (nd->flags & LOOKUP_RCU) {
                error = handle_dots(nd, nd->last_type);
                if (error)
                        return error;
 -              dentry = dget(nd->path.dentry);
 +              path.dentry = dget(nd->path.dentry);
        } else {
 -              dentry = d_lookup(dir, &nd->last);
 -              if (!dentry) {
 +              path.dentry = d_lookup(dir, &nd->last);
 +              if (!path.dentry) {
                        /*
                         * No cached dentry. Mounted dentries are pinned in the
                         * cache, so that means that this dentry is probably
                         * a symlink or the path doesn't actually point
                         * to a mounted dentry.
                         */
 -                      dentry = lookup_slow(&nd->last, dir,
 +                      path.dentry = lookup_slow(&nd->last, dir,
                                             nd->flags | LOOKUP_NO_REVAL);
 -                      if (IS_ERR(dentry))
 -                              return PTR_ERR(dentry);
 +                      if (IS_ERR(path.dentry))
 +                              return PTR_ERR(path.dentry);
                }
        }
 -      if (d_is_negative(dentry)) {
 -              dput(dentry);
 +      if (d_is_negative(path.dentry)) {
 +              dput(path.dentry);
                return -ENOENT;
        }
 -      if (nd->depth)
 -              put_link(nd);
 -      path->dentry = dentry;
 -      path->mnt = nd->path.mnt;
 -      error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
 -                                 d_backing_inode(dentry), 0);
 -      if (unlikely(error))
 -              return error;
 -      mntget(path->mnt);
 -      follow_mount(path);
 -      return 0;
 +      path.mnt = nd->path.mnt;
 +      return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
  }
  
  /**
@@@ -2634,19 -2671,13 +2633,19 @@@ path_mountpoint(struct nameidata *nd, u
        if (IS_ERR(s))
                return PTR_ERR(s);
        while (!(err = link_path_walk(s, nd)) &&
 -              (err = mountpoint_last(nd, path)) > 0) {
 +              (err = mountpoint_last(nd)) > 0) {
                s = trailing_symlink(nd);
                if (IS_ERR(s)) {
                        err = PTR_ERR(s);
                        break;
                }
        }
 +      if (!err) {
 +              *path = nd->path;
 +              nd->path.mnt = NULL;
 +              nd->path.dentry = NULL;
 +              follow_mount(path);
 +      }
        terminate_walk(nd);
        return err;
  }
@@@ -2863,7 -2894,7 +2862,7 @@@ bool may_open_dev(const struct path *pa
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
  }
  
- static int may_open(struct path *path, int acc_mode, int flag)
+ static int may_open(const struct path *path, int acc_mode, int flag)
  {
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
  
  static int handle_truncate(struct file *filp)
  {
-       struct path *path = &filp->f_path;
+       const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
@@@ -3303,11 -3334,18 +3302,11 @@@ static int do_last(struct nameidata *nd
        seq = 0;        /* out of RCU mode, so the value doesn't matter */
        inode = d_backing_inode(path.dentry);
  finish_lookup:
 -      if (nd->depth)
 -              put_link(nd);
 -      error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
 -                                 inode, seq);
 +      error = step_into(nd, &path, 0, inode, seq);
        if (unlikely(error))
                return error;
 -
 -      path_to_nameidata(&path, nd);
 -      nd->inode = inode;
 -      nd->seq = seq;
 -      /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
  finish_open:
 +      /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
        error = complete_walk(nd);
        if (error)
                return error;
@@@ -3822,8 -3860,8 +3821,8 @@@ static long do_rmdir(int dfd, const cha
        int type;
        unsigned int lookup_flags = 0;
  retry:
 -      name = user_path_parent(dfd, pathname,
 -                              &path, &last, &type, lookup_flags);
 +      name = filename_parentat(dfd, getname(pathname), lookup_flags,
 +                              &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);
  
@@@ -3952,8 -3990,8 +3951,8 @@@ static long do_unlinkat(int dfd, const 
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
  retry:
 -      name = user_path_parent(dfd, pathname,
 -                              &path, &last, &type, lookup_flags);
 +      name = filename_parentat(dfd, getname(pathname), lookup_flags,
 +                              &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);
  
@@@ -4306,7 -4344,11 +4305,7 @@@ int vfs_rename(struct inode *old_dir, s
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
  
 -      /*
 -       * Check source == target.
 -       * On overlayfs need to look at underlying inodes.
 -       */
 -      if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
 +      if (source == target)
                return 0;
  
        error = may_delete(old_dir, old_dentry, is_dir);
@@@ -4448,15 -4490,15 +4447,15 @@@ SYSCALL_DEFINE5(renameat2, int, olddfd
                target_flags = 0;
  
  retry:
 -      from = user_path_parent(olddfd, oldname,
 -                              &old_path, &old_last, &old_type, lookup_flags);
 +      from = filename_parentat(olddfd, getname(oldname), lookup_flags,
 +                              &old_path, &old_last, &old_type);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto exit;
        }
  
 -      to = user_path_parent(newdfd, newname,
 -                              &new_path, &new_last, &new_type, lookup_flags);
 +      to = filename_parentat(newdfd, getname(newname), lookup_flags,
 +                              &new_path, &new_last, &new_type);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
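One user-visible property preserved by the vfs_rename() hunk above (the
overlayfs-aware d_real_inode() comparison is replaced by a plain
"source == target" check): renaming a file onto one of its own hard
links is still a successful no-op, as POSIX requires.  A minimal
user-space check, using hypothetical file names "a" and "b":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        if (close(creat("a", 0644)) || link("a", "b"))
                return 1;
        /* "a" and "b" are the same inode: rename() must succeed and
         * leave both names in place. */
        if (rename("a", "b"))
                perror("rename");
        return access("a", F_OK) || access("b", F_OK);
}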
diff --combined fs/nfs/file.c
@@@ -102,11 -102,8 +102,11 @@@ static int nfs_revalidate_file_size(str
  {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
 +      const unsigned long force_reval = NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED;
 +      unsigned long cache_validity = nfsi->cache_validity;
  
 -      if (nfs_have_delegated_attributes(inode))
 +      if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
 +          (cache_validity & force_reval) != force_reval)
                goto out_noreval;
  
        if (filp->f_flags & O_DIRECT)
@@@ -377,7 -374,7 +377,7 @@@ static int nfs_write_end(struct file *f
         */
        if (!PageUptodate(page)) {
                unsigned pglen = nfs_page_length(page);
-               unsigned end = offset + len;
+               unsigned end = offset + copied;
  
                if (pglen == 0) {
                        zero_user_segments(page, 0, offset,
diff --combined fs/ocfs2/aops.c
@@@ -464,6 -464,15 +464,15 @@@ static sector_t ocfs2_bmap(struct addre
        trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
                         (unsigned long long)block);
  
+       /*
+        * The swap code (ab-)uses ->bmap to get a block mapping and then
+        * bypasses the file system for actual I/O.  We really can't allow
+        * that on refcounted inodes, so we have to skip out here.  And yes,
+        * 0 is the magic code for a bmap error.
+        */
+       if (ocfs2_is_refcount_inode(inode))
+               return 0;
        /* We don't need to lock journal system files, since they aren't
         * accessed concurrently from multiple nodes.
         */
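The ocfs2_is_refcount_inode() helper used here (and throughout the
refcounttree.c changes below) comes from the "convert inode refcount
test to a helper" commit in this pull; its definition, in
fs/ocfs2/inode.h, is simply:

static inline int ocfs2_is_refcount_inode(struct inode *inode)
{
        return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
}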
@@@ -630,7 -639,7 +639,7 @@@ int ocfs2_map_page_blocks(struct page *
  
                if (!buffer_mapped(bh)) {
                        map_bh(bh, inode->i_sb, *p_blkno);
 -                      unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 +                      clean_bdev_bh_alias(bh);
                }
  
                if (PageUptodate(page)) {
@@@ -1950,7 -1959,8 +1959,7 @@@ static void ocfs2_write_end_inline(stru
  }
  
  int ocfs2_write_end_nolock(struct address_space *mapping,
 -                         loff_t pos, unsigned len, unsigned copied,
 -                         struct page *page, void *fsdata)
 +                         loff_t pos, unsigned len, unsigned copied, void *fsdata)
  {
        int i, ret;
        unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@@ -2063,7 -2073,7 +2072,7 @@@ static int ocfs2_write_end(struct file 
        int ret;
        struct inode *inode = mapping->host;
  
 -      ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
 +      ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
  
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        ocfs2_inode_unlock(inode, 1);
@@@ -2240,7 -2250,7 +2249,7 @@@ static int ocfs2_dio_get_block(struct i
                dwc->dw_zero_count++;
        }
  
 -      ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
 +      ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
        BUG_ON(ret != len);
        ret = 0;
  unlock:
        return ret;
  }
  
- static void ocfs2_dio_end_io_write(struct inode *inode,
-                                  struct ocfs2_dio_write_ctxt *dwc,
-                                  loff_t offset,
-                                  ssize_t bytes)
+ static int ocfs2_dio_end_io_write(struct inode *inode,
+                                 struct ocfs2_dio_write_ctxt *dwc,
+                                 loff_t offset,
+                                 ssize_t bytes)
  {
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_extent_tree et;
                        mlog_errno(ret);
        }
  
-       di = (struct ocfs2_dinode *)di_bh;
+       di = (struct ocfs2_dinode *)di_bh->b_data;
  
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
  
@@@ -2364,6 -2374,8 +2373,8 @@@ out
        if (locked)
                inode_unlock(inode);
        ocfs2_dio_free_write_ctx(inode, dwc);
+       return ret;
  }
  
  /*
@@@ -2378,21 -2390,19 +2389,19 @@@ static int ocfs2_dio_end_io(struct kioc
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        int level;
-       if (bytes <= 0)
-               return 0;
+       int ret = 0;
  
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
  
-       if (private)
-               ocfs2_dio_end_io_write(inode, private, offset, bytes);
+       if (bytes > 0 && private)
+               ret = ocfs2_dio_end_io_write(inode, private, offset, bytes);
  
        ocfs2_iocb_clear_rw_locked(iocb);
  
        level = ocfs2_iocb_rw_locked_level(iocb);
        ocfs2_rw_unlock(inode, level);
-       return 0;
+       return ret;
  }
  
  static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
diff --combined fs/ocfs2/refcounttree.c
@@@ -34,6 -34,7 +34,7 @@@
  #include "xattr.h"
  #include "namei.h"
  #include "ocfs2_trace.h"
+ #include "file.h"
  
  #include <linux/bio.h>
  #include <linux/blkdev.h>
@@@ -410,7 -411,7 +411,7 @@@ static int ocfs2_get_refcount_block(str
                goto out;
        }
  
-       BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        di = (struct ocfs2_dinode *)di_bh->b_data;
        *ref_blkno = le64_to_cpu(di->i_refcount_loc);
@@@ -478,6 -479,7 +479,6 @@@ again
        if (ret) {
                mlog_errno(ret);
                ocfs2_unlock_refcount_tree(osb, tree, rw);
 -              ocfs2_refcount_tree_put(tree);
                goto out;
        }
  
@@@ -569,7 -571,7 +570,7 @@@ static int ocfs2_create_refcount_tree(s
        u32 num_got;
        u64 suballoc_loc, first_blkno;
  
-       BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+       BUG_ON(ocfs2_is_refcount_inode(inode));
  
        trace_ocfs2_create_refcount_tree(
                (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@@ -707,7 -709,7 +708,7 @@@ static int ocfs2_set_refcount_tree(stru
        struct ocfs2_refcount_block *rb;
        struct ocfs2_refcount_tree *ref_tree;
  
-       BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+       BUG_ON(ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
                                       &ref_tree, &ref_root_bh);
@@@ -774,7 -776,7 +775,7 @@@ int ocfs2_remove_refcount_tree(struct i
        u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
        u16 bit = 0;
  
-       if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+       if (!ocfs2_is_refcount_inode(inode))
                return 0;
  
        BUG_ON(!ref_blkno);
@@@ -2298,11 -2300,10 +2299,10 @@@ int ocfs2_decrease_refcount(struct inod
  {
        int ret;
        u64 ref_blkno;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *tree;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_get_refcount_block(inode, &ref_blkno);
        if (ret) {
@@@ -2532,7 -2533,6 +2532,6 @@@ int ocfs2_prepare_refcount_change_for_d
                                          int *ref_blocks)
  {
        int ret;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *tree;
        u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
                goto out;
        }
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
                                      refcount_loc, &tree);
@@@ -3411,14 -3411,13 +3410,13 @@@ static int ocfs2_refcount_cow_hunk(stru
  {
        int ret;
        u32 cow_start = 0, cow_len = 0;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *ref_tree;
        struct ocfs2_cow_context *context = NULL;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
                                              cpos, write_len, max_cpos,
@@@ -3628,11 -3627,10 +3626,10 @@@ int ocfs2_refcount_cow_xattr(struct ino
  {
        int ret;
        struct ocfs2_xattr_value_root *xv = vb->vb_xv;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_cow_context *context = NULL;
        u32 cow_start, cow_len;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
                                              cpos, write_len, UINT_MAX,
@@@ -3695,6 -3693,9 +3692,9 @@@ int ocfs2_add_refcount_flag(struct inod
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_alloc_context *meta_ac = NULL;
  
+       /* We need to be able to handle at least an extent tree split. */
+       ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
        ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
                                               ref_ci, ref_root_bh,
                                               p_cluster, num_clusters,
@@@ -3806,7 -3807,7 +3806,7 @@@ static int ocfs2_attach_refcount_tree(s
  
        ocfs2_init_dealloc_ctxt(&dealloc);
  
-       if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+       if (!ocfs2_is_refcount_inode(inode)) {
                ret = ocfs2_create_refcount_tree(inode, di_bh);
                if (ret) {
                        mlog_errno(ret);
@@@ -3933,6 -3934,13 +3933,13 @@@ static int ocfs2_add_refcounted_extent(
        ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
                                      p_cluster, num_clusters,
                                      meta_ac, dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+       ret = dquot_alloc_space_nodirty(inode,
+               ocfs2_clusters_to_bytes(osb->sb, num_clusters));
        if (ret)
                mlog_errno(ret);
  
  
        return error;
  }
+
+ /* Update destination inode size, if necessary. */
+ static int ocfs2_reflink_update_dest(struct inode *dest,
+                                    struct buffer_head *d_bh,
+                                    loff_t newlen)
+ {
+       handle_t *handle;
+       int ret;
+       dest->i_blocks = ocfs2_inode_sector_count(dest);
+       if (newlen <= i_size_read(dest))
+               return 0;
+       handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+                                  OCFS2_INODE_UPDATE_CREDITS);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               return ret;
+       }
+       /* Extend i_size if needed. */
+       spin_lock(&OCFS2_I(dest)->ip_lock);
+       if (newlen > i_size_read(dest))
+               i_size_write(dest, newlen);
+       spin_unlock(&OCFS2_I(dest)->ip_lock);
+       dest->i_ctime = dest->i_mtime = current_time(dest);
+       ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+ out_commit:
+       ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+       return ret;
+ }
+
+ /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+                                     struct buffer_head *s_bh,
+                                     loff_t pos_in,
+                                     struct inode *t_inode,
+                                     struct buffer_head *t_bh,
+                                     loff_t pos_out,
+                                     loff_t len,
+                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
+ {
+       struct ocfs2_extent_tree s_et;
+       struct ocfs2_extent_tree t_et;
+       struct ocfs2_dinode *dis;
+       struct buffer_head *ref_root_bh = NULL;
+       struct ocfs2_refcount_tree *ref_tree;
+       struct ocfs2_super *osb;
+       loff_t pstart, plen;
+       u32 p_cluster, num_clusters, slast, spos, tpos;
+       unsigned int ext_flags;
+       int ret = 0;
+       osb = OCFS2_SB(s_inode->i_sb);
+       dis = (struct ocfs2_dinode *)s_bh->b_data;
+       ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+       ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+       spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+       tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+       slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+       while (spos < slast) {
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       goto out;
+               }
+               /* Look up the extent. */
+               ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+                                        &num_clusters, &ext_flags);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               num_clusters = min_t(u32, num_clusters, slast - spos);
+               /* Punch out the dest range. */
+               pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+               plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+               ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               if (p_cluster == 0)
+                       goto next_loop;
+               /* Lock the refcount btree... */
+               ret = ocfs2_lock_refcount_tree(osb,
+                                              le64_to_cpu(dis->i_refcount_loc),
+                                              1, &ref_tree, &ref_root_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               /* Mark s_inode's extent as refcounted. */
+               if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+                       ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+                                                     &ref_tree->rf_ci,
+                                                     ref_root_bh, spos,
+                                                     p_cluster, num_clusters,
+                                                     dealloc, NULL);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out_unlock_refcount;
+                       }
+               }
+               /* Map in the new extent. */
+               ext_flags |= OCFS2_EXT_REFCOUNTED;
+               ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+                                                 &ref_tree->rf_ci,
+                                                 ref_root_bh,
+                                                 tpos, p_cluster,
+                                                 num_clusters,
+                                                 ext_flags,
+                                                 dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_unlock_refcount;
+               }
+               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+               brelse(ref_root_bh);
+ next_loop:
+               spos += num_clusters;
+               tpos += num_clusters;
+       }
+ out:
+       return ret;
+ out_unlock_refcount:
+       ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+       brelse(ref_root_bh);
+       return ret;
+ }
+
+ /* Set up refcount tree and remap s_inode to t_inode. */
+ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+                                     struct buffer_head *s_bh,
+                                     loff_t pos_in,
+                                     struct inode *t_inode,
+                                     struct buffer_head *t_bh,
+                                     loff_t pos_out,
+                                     loff_t len)
+ {
+       struct ocfs2_cached_dealloc_ctxt dealloc;
+       struct ocfs2_super *osb;
+       struct ocfs2_dinode *dis;
+       struct ocfs2_dinode *dit;
+       int ret;
+       osb = OCFS2_SB(s_inode->i_sb);
+       dis = (struct ocfs2_dinode *)s_bh->b_data;
+       dit = (struct ocfs2_dinode *)t_bh->b_data;
+       ocfs2_init_dealloc_ctxt(&dealloc);
+       /*
+        * If we're reflinking the entire file and the source is inline
+        * data, just copy the contents.
+        */
+       if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+           i_size_read(t_inode) <= len &&
+           (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+               ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+               if (ret)
+                       mlog_errno(ret);
+               goto out;
+       }
+       /*
+        * If both inodes belong to two different refcount groups then
+        * forget it because we don't know how (or want) to go merging
+        * refcount trees.
+        */
+       ret = -EOPNOTSUPP;
+       if (ocfs2_is_refcount_inode(s_inode) &&
+           ocfs2_is_refcount_inode(t_inode) &&
+           le64_to_cpu(dis->i_refcount_loc) !=
+           le64_to_cpu(dit->i_refcount_loc))
+               goto out;
+       /* Neither inode has a refcount tree.  Add one to s_inode. */
+       if (!ocfs2_is_refcount_inode(s_inode) &&
+           !ocfs2_is_refcount_inode(t_inode)) {
+               ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Ensure that both inodes end up with the same refcount tree. */
+       if (!ocfs2_is_refcount_inode(s_inode)) {
+               ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+                                             le64_to_cpu(dit->i_refcount_loc));
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       if (!ocfs2_is_refcount_inode(t_inode)) {
+               ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+                                             le64_to_cpu(dis->i_refcount_loc));
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Turn off inline data in the dest file. */
+       if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Actually remap extents now. */
+       ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+                                        pos_out, len, &dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+ out:
+       if (ocfs2_dealloc_has_cluster(&dealloc)) {
+               ocfs2_schedule_truncate_log_flush(osb, 1);
+               ocfs2_run_deallocs(osb, &dealloc);
+       }
+       return ret;
+ }
+
+ /* Lock an inode and grab a bh pointing to the inode. */
+ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+                                    struct buffer_head **bh1,
+                                    struct inode *t_inode,
+                                    struct buffer_head **bh2)
+ {
+       struct inode *inode1;
+       struct inode *inode2;
+       struct ocfs2_inode_info *oi1;
+       struct ocfs2_inode_info *oi2;
+       bool same_inode = (s_inode == t_inode);
+       int status;
+       /* First grab the VFS and rw locks. */
+       lock_two_nondirectories(s_inode, t_inode);
+       inode1 = s_inode;
+       inode2 = t_inode;
+       if (inode1->i_ino > inode2->i_ino)
+               swap(inode1, inode2);
+       status = ocfs2_rw_lock(inode1, 1);
+       if (status) {
+               mlog_errno(status);
+               goto out_i1;
+       }
+       if (!same_inode) {
+               status = ocfs2_rw_lock(inode2, 1);
+               if (status) {
+                       mlog_errno(status);
+                       goto out_i2;
+               }
+       }
+       /* Now go for the cluster locks */
+       oi1 = OCFS2_I(inode1);
+       oi2 = OCFS2_I(inode2);
+       trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+                               (unsigned long long)oi2->ip_blkno);
+       if (*bh1)
+               *bh1 = NULL;
+       if (*bh2)
+               *bh2 = NULL;
+       /* We always want to lock the one with the lower lockid first. */
+       if (oi1->ip_blkno > oi2->ip_blkno)
+               mlog_errno(-ENOLCK);
+       /* lock id1 */
+       status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+       if (status < 0) {
+               if (status != -ENOENT)
+                       mlog_errno(status);
+               goto out_rw2;
+       }
+       /* lock id2 */
+       if (!same_inode) {
+               status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+                                                OI_LS_REFLINK_TARGET);
+               if (status < 0) {
+                       if (status != -ENOENT)
+                               mlog_errno(status);
+                       goto out_cl1;
+               }
+       } else
+               *bh2 = *bh1;
+       trace_ocfs2_double_lock_end(
+                       (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+                       (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+       return 0;
+ out_cl1:
+       ocfs2_inode_unlock(inode1, 1);
+       brelse(*bh1);
+       *bh1 = NULL;
+ out_rw2:
+       ocfs2_rw_unlock(inode2, 1);
+ out_i2:
+       ocfs2_rw_unlock(inode1, 1);
+ out_i1:
+       unlock_two_nondirectories(s_inode, t_inode);
+       return status;
+ }
+
+ /* Unlock both inodes and release buffers. */
+ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+                                       struct buffer_head *s_bh,
+                                       struct inode *t_inode,
+                                       struct buffer_head *t_bh)
+ {
+       ocfs2_inode_unlock(s_inode, 1);
+       ocfs2_rw_unlock(s_inode, 1);
+       brelse(s_bh);
+       if (s_inode != t_inode) {
+               ocfs2_inode_unlock(t_inode, 1);
+               ocfs2_rw_unlock(t_inode, 1);
+               brelse(t_bh);
+       }
+       unlock_two_nondirectories(s_inode, t_inode);
+ }
+
+ /* Link a range of blocks from one file to another. */
+ int ocfs2_reflink_remap_range(struct file *file_in,
+                             loff_t pos_in,
+                             struct file *file_out,
+                             loff_t pos_out,
+                             u64 len,
+                             bool is_dedupe)
+ {
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
+       struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+       struct buffer_head *in_bh = NULL, *out_bh = NULL;
+       bool same_inode = (inode_in == inode_out);
+       ssize_t ret;
+       if (!ocfs2_refcount_tree(osb))
+               return -EOPNOTSUPP;
+       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+               return -EROFS;
+       /* Lock both files against IO */
+       ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+       if (ret)
+               return ret;
+       /* Check file eligibility and prepare for block sharing. */
+       ret = -EINVAL;
+       if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+           (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+               goto out_unlock;
+       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+                       &len, is_dedupe);
+       if (ret || len == 0)
+               goto out_unlock;
+       /* Lock out changes to the allocation maps and remap. */
+       down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+       if (!same_inode)
+               down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+                                 SINGLE_DEPTH_NESTING);
+       ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+                                        out_bh, pos_out, len);
+       /* Zap any page cache for the destination file's range. */
+       if (!ret)
+               truncate_inode_pages_range(&inode_out->i_data, pos_out,
+                                          PAGE_ALIGN(pos_out + len) - 1);
+       up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+       if (!same_inode)
+               up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+       /*
+        * Empty the extent map so that we may get the right extent
+        * record from the disk.
+        */
+       ocfs2_extent_map_trunc(inode_in, 0);
+       ocfs2_extent_map_trunc(inode_out, 0);
+       ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+       return 0;
+ out_unlock:
+       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+       return ret;
+ }
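With ocfs2_reflink_remap_range() wired up to the VFS clone and dedupe
methods, the FICLONE/FICLONERANGE ioctls now work on refcount-enabled
ocfs2 volumes.  A minimal user-space sketch of a whole-file clone via
FICLONERANGE (a src_length of 0 means "to EOF"; sub-ranges must be
block-aligned, per the VFS prep checks in fs/read_write.c below):

#include <fcntl.h>
#include <linux/fs.h>           /* FICLONERANGE, struct file_clone_range */
#include <stdio.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
        struct file_clone_range fcr = { 0 };
        int src, dst;

        if (argc != 3)
                return 1;
        src = open(argv[1], O_RDONLY);
        dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
        if (src < 0 || dst < 0)
                return 1;
        fcr.src_fd = src;       /* offsets/length left at 0: clone to EOF */
        if (ioctl(dst, FICLONERANGE, &fcr) < 0)
                perror("FICLONERANGE");
        return 0;
}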
diff --combined fs/read_write.c
@@@ -1538,26 -1538,45 +1538,43 @@@ ssize_t vfs_copy_file_range(struct fil
        if (len == 0)
                return 0;
  
 -      ret = mnt_want_write_file(file_out);
 -      if (ret)
 -              return ret;
 +      sb_start_write(inode_out->i_sb);
  
-       ret = -EOPNOTSUPP;
-       if (file_out->f_op->copy_file_range)
+       /*
+        * Try cloning first, this is supported by more file systems, and
+        * more efficient if both clone and copy are supported (e.g. NFS).
+        */
+       if (file_in->f_op->clone_file_range) {
+               ret = file_in->f_op->clone_file_range(file_in, pos_in,
+                               file_out, pos_out, len);
+               if (ret == 0) {
+                       ret = len;
+                       goto done;
+               }
+       }
+       if (file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
                                                      pos_out, len, flags);
-       if (ret == -EOPNOTSUPP)
-               ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+               if (ret != -EOPNOTSUPP)
+                       goto done;
+       }
+       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
  
+ done:
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }
        inc_syscr(current);
        inc_syscw(current);
  
 -      mnt_drop_write_file(file_out);
 +      sb_end_write(inode_out->i_sb);
  
        return ret;
  }
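With the clone-first ordering above, copy_file_range(2) transparently
reflinks when the underlying filesystem supports it and otherwise falls
back to ->copy_file_range and then to splice.  A user-space sketch using
the raw syscall (no glibc wrapper exists at this point;
__NR_copy_file_range requires 4.5+ kernel headers):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        loff_t off_in = 0, off_out = 0;
        struct stat st;
        ssize_t n;
        int in, out;

        if (argc != 3)
                return 1;
        in = open(argv[1], O_RDONLY);
        if (in < 0 || fstat(in, &st))
                return 1;
        out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (out < 0)
                return 1;
        while (off_in < st.st_size) {
                n = syscall(__NR_copy_file_range, in, &off_in, out,
                            &off_out, st.st_size - off_in, 0);
                if (n <= 0) {
                        perror("copy_file_range");
                        return 1;
                }
        }
        return 0;
}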
@@@ -1648,6 -1667,114 +1665,114 @@@ static int clone_verify_area(struct fil
        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
  }
  
+ /*
+  * Check that the two inodes are eligible for cloning, the ranges make
+  * sense, and then flush all dirty data.  Caller must ensure that the
+  * inodes have been locked against any other modifications.
+  */
+ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+                              struct inode *inode_out, loff_t pos_out,
+                              u64 *len, bool is_dedupe)
+ {
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       loff_t blen;
+       loff_t isize;
+       bool same_inode = (inode_in == inode_out);
+       int ret;
+       /* Don't touch certain kinds of inodes */
+       if (IS_IMMUTABLE(inode_out))
+               return -EPERM;
+       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+               return -ETXTBSY;
+       /* Don't reflink dirs, pipes, sockets... */
+       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+               return -EISDIR;
+       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+               return -EINVAL;
+       /* Are we going all the way to the end? */
+       isize = i_size_read(inode_in);
+       if (isize == 0) {
+               *len = 0;
+               return 0;
+       }
+       /* Zero length dedupe exits immediately; reflink goes to EOF. */
+       if (*len == 0) {
+               if (is_dedupe) {
+                       *len = 0;
+                       return 0;
+               }
+               *len = isize - pos_in;
+       }
+       /* Ensure offsets don't wrap and the input is inside i_size */
+       if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
+           pos_in + *len > isize)
+               return -EINVAL;
+       /* Don't allow dedupe past EOF in the dest file */
+       if (is_dedupe) {
+               loff_t  disize;
+               disize = i_size_read(inode_out);
+               if (pos_out >= disize || pos_out + *len > disize)
+                       return -EINVAL;
+       }
+       /* If we're linking to EOF, continue to the block boundary. */
+       if (pos_in + *len == isize)
+               blen = ALIGN(isize, bs) - pos_in;
+       else
+               blen = *len;
+       /* Only reflink if we're aligned to block boundaries */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+               return -EINVAL;
+       /* Don't allow overlapped reflink within the same file */
+       if (same_inode) {
+               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+                       return -EINVAL;
+       }
+       /* Wait for the completion of any pending IOs on both files */
+       inode_dio_wait(inode_in);
+       if (!same_inode)
+               inode_dio_wait(inode_out);
+       ret = filemap_write_and_wait_range(inode_in->i_mapping,
+                       pos_in, pos_in + *len - 1);
+       if (ret)
+               return ret;
+       ret = filemap_write_and_wait_range(inode_out->i_mapping,
+                       pos_out, pos_out + *len - 1);
+       if (ret)
+               return ret;
+       /*
+        * Check that the extents are the same.
+        */
+       if (is_dedupe) {
+               bool            is_same = false;
+               ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+                               inode_out, pos_out, *len, &is_same);
+               if (ret)
+                       return ret;
+               if (!is_same)
+                       return -EBADE;
+       }
+       return 0;
+ }
+ EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+
  int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, u64 len)
  {
        struct inode *inode_out = file_inode(file_out);
        int ret;
  
 -      if (inode_in->i_sb != inode_out->i_sb ||
 -          file_in->f_path.mnt != file_out->f_path.mnt)
 -              return -EXDEV;
 -
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;
  
 +      /*
 +       * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
 +       * the same mount. Practically, they only need to be on the same file
 +       * system.
 +       */
 +      if (inode_in->i_sb != inode_out->i_sb)
 +              return -EXDEV;
 +
        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
            (file_out->f_flags & O_APPEND))
        if (pos_in + len > i_size_read(inode_in))
                return -EINVAL;
  
 -      ret = mnt_want_write_file(file_out);
 -      if (ret)
 -              return ret;
 -
        ret = file_in->f_op->clone_file_range(file_in, pos_in,
                        file_out, pos_out, len);
        if (!ret) {
                fsnotify_modify(file_out);
        }
  
 -      mnt_drop_write_file(file_out);
        return ret;
  }
  EXPORT_SYMBOL(vfs_clone_file_range);
  
+ /*
+  * Read a page's worth of file data into the page cache.  Return the page
+  * locked.
+  */
+ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+ {
+       struct address_space *mapping;
+       struct page *page;
+       pgoff_t n;
+       n = offset >> PAGE_SHIFT;
+       mapping = inode->i_mapping;
+       page = read_mapping_page(mapping, n, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       lock_page(page);
+       return page;
+ }
+
+ /*
+  * Compare extents of two files to see if they are the same.
+  * Caller must have locked both inodes to prevent write races.
+  */
+ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                 struct inode *dest, loff_t destoff,
+                                 loff_t len, bool *is_same)
+ {
+       loff_t src_poff;
+       loff_t dest_poff;
+       void *src_addr;
+       void *dest_addr;
+       struct page *src_page;
+       struct page *dest_page;
+       loff_t cmp_len;
+       bool same;
+       int error;
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_SIZE - 1);
+               dest_poff = destoff & (PAGE_SIZE - 1);
+               cmp_len = min(PAGE_SIZE - src_poff,
+                             PAGE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               if (cmp_len <= 0)
+                       goto out_error;
+               src_page = vfs_dedupe_get_page(src, srcoff);
+               if (IS_ERR(src_page)) {
+                       error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = vfs_dedupe_get_page(dest, destoff);
+               if (IS_ERR(dest_page)) {
+                       error = PTR_ERR(dest_page);
+                       unlock_page(src_page);
+                       put_page(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               unlock_page(dest_page);
+               unlock_page(src_page);
+               put_page(dest_page);
+               put_page(src_page);
+               if (!same)
+                       break;
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+       *is_same = same;
+       return 0;
+ out_error:
+       return error;
+ }
+ EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
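vfs_dedupe_file_range_compare() is what guarantees FIDEDUPERANGE only
shares byte-identical ranges; a mismatch surfaces internally as -EBADE,
which the ioctl reports per destination as FILE_DEDUPE_RANGE_DIFFERS.
A minimal sketch of the user-space side (4.5+ uapi headers assumed;
argv[3] is the number of bytes to dedupe):

#include <fcntl.h>
#include <linux/fs.h>           /* FIDEDUPERANGE, struct file_dedupe_range */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
        struct file_dedupe_range *r;
        int src, dst;

        if (argc != 4)
                return 1;
        src = open(argv[1], O_RDONLY);
        dst = open(argv[2], O_WRONLY);
        if (src < 0 || dst < 0)
                return 1;
        r = calloc(1, sizeof(*r) + sizeof(struct file_dedupe_range_info));
        if (!r)
                return 1;
        r->src_length = strtoull(argv[3], NULL, 0);
        r->dest_count = 1;
        r->info[0].dest_fd = dst;
        if (ioctl(src, FIDEDUPERANGE, r) < 0) {
                perror("FIDEDUPERANGE");
                return 1;
        }
        if (r->info[0].status == FILE_DEDUPE_RANGE_SAME)
                printf("deduped %llu bytes\n",
                       (unsigned long long)r->info[0].bytes_deduped);
        else
                printf("not deduped, status %d\n", (int)r->info[0].status);
        return 0;
}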
  int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
  {
        struct file_dedupe_range_info *info;
diff --combined fs/xfs/xfs_file.c
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
 -/*
 - * Locking primitives for read and write IO paths to ensure we consistently use
 - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
 - */
 -static inline void
 -xfs_rw_ilock(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_lock(VFS_I(ip));
 -      xfs_ilock(ip, type);
 -}
 -
 -static inline void
 -xfs_rw_iunlock(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      xfs_iunlock(ip, type);
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_unlock(VFS_I(ip));
 -}
 -
 -static inline void
 -xfs_rw_ilock_demote(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      xfs_ilock_demote(ip, type);
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_unlock(VFS_I(ip));
 -}
 -
  /*
   * Clear the specified ranges to zero through either the pagecache or DAX.
   * Holes and unwritten extents will be left as-is as they already are zeroed.
@@@ -149,16 -183,19 +149,16 @@@ xfs_file_fsync
  
        xfs_iflags_clear(ip, XFS_ITRUNCATED);
  
 -      if (mp->m_flags & XFS_MOUNT_BARRIER) {
 -              /*
 -               * If we have an RT and/or log subvolume we need to make sure
 -               * to flush the write cache the device used for file data
 -               * first.  This is to ensure newly written file data make
 -               * it to disk before logging the new inode size in case of
 -               * an extending write.
 -               */
 -              if (XFS_IS_REALTIME_INODE(ip))
 -                      xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 -              else if (mp->m_logdev_targp != mp->m_ddev_targp)
 -                      xfs_blkdev_issue_flush(mp->m_ddev_targp);
 -      }
 +      /*
 +       * If we have an RT and/or log subvolume we need to make sure to flush
 +       * the write cache the device used for file data first.  This is to
 +       * ensure newly written file data make it to disk before logging the new
 +       * inode size in case of an extending write.
 +       */
 +      if (XFS_IS_REALTIME_INODE(ip))
 +              xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 +      else if (mp->m_logdev_targp != mp->m_ddev_targp)
 +              xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
        /*
         * All metadata updates are logged, which means that we just have to
         * an already allocated file and thus do not have any metadata to
         * commit.
         */
 -      if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
 -          mp->m_logdev_targp == mp->m_ddev_targp &&
 -          !XFS_IS_REALTIME_INODE(ip) &&
 -          !log_flushed)
 +      if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
 +          mp->m_logdev_targp == mp->m_ddev_targp)
                xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
        return error;
@@@ -205,21 -244,62 +205,21 @@@ xfs_file_dio_aio_read
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
 -      struct address_space    *mapping = iocb->ki_filp->f_mapping;
 -      struct inode            *inode = mapping->host;
 -      struct xfs_inode        *ip = XFS_I(inode);
 -      loff_t                  isize = i_size_read(inode);
 +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        size_t                  count = iov_iter_count(to);
 -      loff_t                  end = iocb->ki_pos + count - 1;
 -      struct iov_iter         data;
 -      struct xfs_buftarg      *target;
 -      ssize_t                 ret = 0;
 +      ssize_t                 ret;
  
        trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
  
        if (!count)
                return 0; /* skip atime */
  
 -      if (XFS_IS_REALTIME_INODE(ip))
 -              target = ip->i_mount->m_rtdev_targp;
 -      else
 -              target = ip->i_mount->m_ddev_targp;
 -
 -      /* DIO must be aligned to device logical sector size */
 -      if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
 -              if (iocb->ki_pos == isize)
 -                      return 0;
 -              return -EINVAL;
 -      }
 -
        file_accessed(iocb->ki_filp);
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 -      if (mapping->nrpages) {
 -              ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 -              if (ret)
 -                      goto out_unlock;
 -
 -              /*
 -               * Invalidate whole pages. This can return an error if we fail
 -               * to invalidate a page, but this should never happen on XFS.
 -               * Warn if it does fail.
 -               */
 -              ret = invalidate_inode_pages2_range(mapping,
 -                              iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -              ret = 0;
 -      }
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
 +      ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
 -      data = *to;
 -      ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 -                      xfs_get_blocks_direct, NULL, NULL, 0);
 -      if (ret >= 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(to, ret);
 -      }
 -
 -out_unlock:
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
  }
  
@@@ -237,9 -317,9 +237,9 @@@ xfs_file_dax_read
        if (!count)
                return 0; /* skip atime */
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 -      ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
 +      ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
        file_accessed(iocb->ki_filp);
        return ret;
@@@ -255,9 -335,9 +255,9 @@@ xfs_file_buffered_aio_read
  
        trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
        return ret;
  }
@@@ -338,18 -418,15 +338,18 @@@ restart
        if (error <= 0)
                return error;
  
 -      error = xfs_break_layouts(inode, iolock, true);
 +      error = xfs_break_layouts(inode, iolock);
        if (error)
                return error;
  
 -      /* For changing security info in file_remove_privs() we need i_mutex */
 +      /*
 +       * For changing security info in file_remove_privs() we need i_rwsem
 +       * exclusively.
 +       */
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
 -              xfs_rw_iunlock(ip, *iolock);
 +              xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
 -              xfs_rw_ilock(ip, *iolock);
 +              xfs_ilock(ip, *iolock);
                goto restart;
        }
        /*
                spin_unlock(&ip->i_flags_lock);
                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
 -                              xfs_rw_iunlock(ip, *iolock);
 +                              xfs_iunlock(ip, *iolock);
                                *iolock = XFS_IOLOCK_EXCL;
 -                              xfs_rw_ilock(ip, *iolock);
 +                              xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
                        /*
        return 0;
  }
  
 +static int
 +xfs_dio_write_end_io(
 +      struct kiocb            *iocb,
 +      ssize_t                 size,
 +      unsigned                flags)
 +{
 +      struct inode            *inode = file_inode(iocb->ki_filp);
 +      struct xfs_inode        *ip = XFS_I(inode);
 +      loff_t                  offset = iocb->ki_pos;
 +      bool                    update_size = false;
 +      int                     error = 0;
 +
 +      trace_xfs_end_io_direct_write(ip, offset, size);
 +
 +      if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 +              return -EIO;
 +
 +      if (size <= 0)
 +              return size;
 +
 +      /*
 +       * We need to update the in-core inode size here so that we don't end up
 +       * with the on-disk inode size being outside the in-core inode size. We
 +       * have no other method of updating EOF for AIO, so always do it here
 +       * if necessary.
 +       *
 +       * We need to lock the test/set EOF update as we can be racing with
 +       * other IO completions here to update the EOF. Failing to serialise
 +       * here can result in EOF moving backwards and Bad Things Happen when
 +       * that occurs.
 +       */
 +      spin_lock(&ip->i_flags_lock);
 +      if (offset + size > i_size_read(inode)) {
 +              i_size_write(inode, offset + size);
 +              update_size = true;
 +      }
 +      spin_unlock(&ip->i_flags_lock);
 +
 +      if (flags & IOMAP_DIO_COW) {
 +              error = xfs_reflink_end_cow(ip, offset, size);
 +              if (error)
 +                      return error;
 +      }
 +
 +      if (flags & IOMAP_DIO_UNWRITTEN)
 +              error = xfs_iomap_write_unwritten(ip, offset, size);
 +      else if (update_size)
 +              error = xfs_setfilesize(ip, offset, size);
 +
 +      return error;
 +}
 +
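The size update in xfs_dio_write_end_io() above is the classic monotonic-EOF idiom, restated here in isolation (a condensation of the function above, not new logic):

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {	/* only ever grow */
		i_size_write(inode, offset + size);
		update_size = true;	/* on-disk size must catch up too */
	}
	spin_unlock(&ip->i_flags_lock);

Without the lock, two AIO completions racing on adjacent ranges could apply their i_size_write() calls out of order, and the later, smaller value would move EOF backwards.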
  /*
   * xfs_file_dio_aio_write - handle direct IO writes
   *
@@@ -510,7 -535,9 +510,7 @@@ xfs_file_dio_aio_write
        int                     unaligned_io = 0;
        int                     iolock;
        size_t                  count = iov_iter_count(from);
 -      loff_t                  end;
 -      struct iov_iter         data;
 -      struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
 +      struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        /* DIO must be aligned to device logical sector size */
                iolock = XFS_IOLOCK_SHARED;
        }
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
  
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
 -      end = iocb->ki_pos + count - 1;
 -
 -      if (mapping->nrpages) {
 -              ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 -              if (ret)
 -                      goto out;
 -
 -              /*
 -               * Invalidate whole pages. This can return an error if we fail
 -               * to invalidate a page, but this should never happen on XFS.
 -               * Warn if it does fail.
 -               */
 -              ret = invalidate_inode_pages2_range(mapping,
 -                              iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -              ret = 0;
 -      }
  
        /*
         * If we are doing unaligned IO, wait for all other IO to drain,
        if (unaligned_io)
                inode_dio_wait(inode);
        else if (iolock == XFS_IOLOCK_EXCL) {
 -              xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 +              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }
  
                        goto out;
        }
  
 -      data = *from;
 -      ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 -                      xfs_get_blocks_direct, xfs_end_io_direct_write,
 -                      NULL, DIO_ASYNC_EXTEND);
 -
 -      /* see generic_file_direct_write() for why this is necessary */
 -      if (mapping->nrpages) {
 -              invalidate_inode_pages2_range(mapping,
 -                                            iocb->ki_pos >> PAGE_SHIFT,
 -                                            end >> PAGE_SHIFT);
 -      }
 -
 -      if (ret > 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(from, ret);
 -      }
 +      ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
  
        /*
         * No fallback to buffered IO on errors for XFS, direct IO will either
@@@ -584,7 -643,7 +584,7 @@@ xfs_file_dax_write
        size_t                  count;
        loff_t                  pos;
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
  
        trace_xfs_file_dax_write(ip, count, pos);
 -
 -      ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
 +      ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
 -
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
        return error ? error : ret;
  }
  
@@@ -616,7 -677,7 +616,7 @@@ xfs_file_buffered_aio_write
        int                     enospc = 0;
        int                     iolock = XFS_IOLOCK_EXCL;
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
  
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
@@@ -660,7 -721,7 +660,7 @@@ write_retry
  
        current->backing_dev_info = NULL;
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
        return ret;
  }
  
@@@ -736,7 -797,7 +736,7 @@@ xfs_file_fallocate
                return -EOPNOTSUPP;
  
        xfs_ilock(ip, iolock);
 -      error = xfs_break_layouts(inode, &iolock, false);
 +      error = xfs_break_layouts(inode, &iolock);
        if (error)
                goto out_unlock;
  
@@@ -848,24 -909,6 +848,6 @@@ out_unlock
        return error;
  }
  
- STATIC ssize_t
- xfs_file_copy_range(
-       struct file     *file_in,
-       loff_t          pos_in,
-       struct file     *file_out,
-       loff_t          pos_out,
-       size_t          len,
-       unsigned int    flags)
- {
-       int             error;
- 
-       error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-                                    len, false);
-       if (error)
-               return error;
-       return len;
- }
- 
  STATIC int
  xfs_file_clone_range(
        struct file     *file_in,
                                     len, false);
  }
  
 -#define XFS_MAX_DEDUPE_LEN    (16 * 1024 * 1024)
  STATIC ssize_t
  xfs_file_dedupe_range(
        struct file     *src_file,
  {
        int             error;
  
 -      /*
 -       * Limit the total length we will dedupe for each operation.
 -       * This is intended to bound the total time spent in this
 -       * ioctl to something sane.
 -       */
 -      if (len > XFS_MAX_DEDUPE_LEN)
 -              len = XFS_MAX_DEDUPE_LEN;
 -
        error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
                                     len, true);
        if (error)
@@@ -1404,7 -1456,7 +1386,7 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
 -              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 +              ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
        } else {
                ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
@@@ -1431,9 -1483,15 +1413,9 @@@ xfs_filemap_fault
                return xfs_filemap_page_mkwrite(vma, vmf);
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      if (IS_DAX(inode)) {
 -              /*
 -               * we do not want to trigger unwritten extent conversion on read
 -               * faults - that is unnecessary overhead and would also require
 -               * changes to xfs_get_blocks_direct() to map unwritten extent
 -               * ioend for conversion on read-only mappings.
 -               */
 -              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 -      } else
 +      if (IS_DAX(inode))
 +              ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 +      else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
@@@ -1469,7 -1527,7 +1451,7 @@@ xfs_filemap_pmd_fault
        }
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
 +      ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (flags & FAULT_FLAG_WRITE)
@@@ -1549,7 -1607,6 +1531,6 @@@ const struct file_operations xfs_file_o
        .fsync          = xfs_file_fsync,
        .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
-       .copy_file_range = xfs_file_copy_range,
        .clone_file_range = xfs_file_clone_range,
        .dedupe_file_range = xfs_file_dedupe_range,
  };
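With the old __blockdev_direct_IO() call gone, all direct-write completion work in this file funnels through a single iomap callback. A minimal sketch of that wiring against the 4.10-era iomap API; the example_* names are illustrative, not part of this patch, and real error handling is elided:

static int
example_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,	/* bytes completed, or an error */
	unsigned		flags)	/* IOMAP_DIO_UNWRITTEN, IOMAP_DIO_COW */
{
	if (size <= 0)
		return size;
	if (flags & IOMAP_DIO_UNWRITTEN)
		return 0;	/* convert unwritten extents here */
	return 0;		/* otherwise update the on-disk size if needed */
}

static ssize_t
example_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	/* iomap_dio_rw() handles writeback, invalidation and AIO itself */
	return iomap_dio_rw(iocb, from, &xfs_iomap_ops,
			example_dio_write_end_io);
}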
diff --combined fs/xfs/xfs_reflink.c
@@@ -243,11 -243,12 +243,11 @@@ xfs_reflink_reserve_cow
        struct xfs_bmbt_irec    *imap,
        bool                    *shared)
  {
 -      struct xfs_bmbt_irec    got, prev;
 -      xfs_fileoff_t           end_fsb, orig_end_fsb;
 -      int                     eof = 0, error = 0;
 -      bool                    trimmed;
 +      struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      struct xfs_bmbt_irec    got;
 +      int                     error = 0;
 +      bool                    eof = false, trimmed;
        xfs_extnum_t            idx;
 -      xfs_extlen_t            align;
  
        /*
         * Search the COW fork extent list first.  This serves two purposes:
         * extent list is generally faster than going out to the shared extent
         * tree.
         */
 -      xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 +
 +      if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
 +              eof = true;
        if (!eof && got.br_startoff <= imap->br_startoff) {
                trace_xfs_reflink_cow_found(ip, imap);
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
        if (error)
                return error;
  
 -      end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
 -
 -      align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
 -      if (align)
 -              end_fsb = roundup_64(end_fsb, align);
 -
 -retry:
        error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
 -                      end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
 -      switch (error) {
 -      case 0:
 -              break;
 -      case -ENOSPC:
 -      case -EDQUOT:
 -              /* retry without any preallocation */
 +                      imap->br_blockcount, 0, &got, &idx, eof);
 +      if (error == -ENOSPC || error == -EDQUOT)
                trace_xfs_reflink_cow_enospc(ip, imap);
 -              if (end_fsb != orig_end_fsb) {
 -                      end_fsb = orig_end_fsb;
 -                      goto retry;
 -              }
 -              /*FALLTHRU*/
 -      default:
 +      if (error)
                return error;
 -      }
 -
 -      if (end_fsb != orig_end_fsb)
 -              xfs_inode_set_cowblocks_tag(ip);
  
        trace_xfs_reflink_cow_alloc(ip, &got);
        return 0;
@@@ -397,65 -418,87 +397,65 @@@ xfs_reflink_allocate_cow_range
  }
  
  /*
 - * Find the CoW reservation (and whether or not it needs block allocation)
 - * for a given byte offset of a file.
 + * Find the CoW reservation for a given byte offset of a file.
   */
  bool
  xfs_reflink_find_cow_mapping(
        struct xfs_inode                *ip,
        xfs_off_t                       offset,
 -      struct xfs_bmbt_irec            *imap,
 -      bool                            *need_alloc)
 +      struct xfs_bmbt_irec            *imap)
  {
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 -      xfs_fileoff_t                   bno;
 +      struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      xfs_fileoff_t                   offset_fsb;
 +      struct xfs_bmbt_irec            got;
        xfs_extnum_t                    idx;
  
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
        ASSERT(xfs_is_reflink_inode(ip));
  
 -      /* Find the extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      bno = XFS_B_TO_FSBT(ip->i_mount, offset);
 -      gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
 -      if (!gotp)
 +      offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
                return false;
 -
 -      xfs_bmbt_get_all(gotp, &irec);
 -      if (bno >= irec.br_startoff + irec.br_blockcount ||
 -          bno < irec.br_startoff)
 +      if (got.br_startoff > offset_fsb)
                return false;
  
        trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
 -                      &irec);
 -
 -      /* If it's still delalloc, we must allocate later. */
 -      *imap = irec;
 -      *need_alloc = !!(isnullstartblock(irec.br_startblock));
 -
 +                      &got);
 +      *imap = got;
        return true;
  }
  
  /*
   * Trim an extent to end at the next CoW reservation past offset_fsb.
   */
 -int
 +void
  xfs_reflink_trim_irec_to_next_cow(
        struct xfs_inode                *ip,
        xfs_fileoff_t                   offset_fsb,
        struct xfs_bmbt_irec            *imap)
  {
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 +      struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      struct xfs_bmbt_irec            got;
        xfs_extnum_t                    idx;
  
        if (!xfs_is_reflink_inode(ip))
 -              return 0;
 +              return;
  
        /* Find the extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
 -      if (!gotp)
 -              return 0;
 -      xfs_bmbt_get_all(gotp, &irec);
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
 +              return;
  
        /* This is the extent before; try sliding up one. */
 -      if (irec.br_startoff < offset_fsb) {
 -              idx++;
 -              if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
 -                      return 0;
 -              gotp = xfs_iext_get_ext(ifp, idx);
 -              xfs_bmbt_get_all(gotp, &irec);
 +      if (got.br_startoff < offset_fsb) {
 +              if (!xfs_iext_get_extent(ifp, idx + 1, &got))
 +                      return;
        }
  
 -      if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
 -              return 0;
 +      if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
 +              return;
  
 -      imap->br_blockcount = irec.br_startoff - imap->br_startoff;
 +      imap->br_blockcount = got.br_startoff - imap->br_startoff;
        trace_xfs_reflink_trim_irec(ip, imap);
 -
 -      return 0;
  }
  
  /*
@@@ -469,15 -512,18 +469,15 @@@ xfs_reflink_cancel_cow_blocks
        xfs_fileoff_t                   end_fsb)
  {
        struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      struct xfs_bmbt_irec            got, prev, del;
 +      struct xfs_bmbt_irec            got, del;
        xfs_extnum_t                    idx;
        xfs_fsblock_t                   firstfsb;
        struct xfs_defer_ops            dfops;
 -      int                             error = 0, eof = 0;
 +      int                             error = 0;
  
        if (!xfs_is_reflink_inode(ip))
                return 0;
 -
 -      xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 -      if (eof)
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
                return 0;
  
        while (got.br_startoff < end_fsb) {
                        xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
                }
  
 -              if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
 +              if (!xfs_iext_get_extent(ifp, ++idx, &got))
                        break;
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
        }
  
        /* clear tag if cow fork is emptied */
@@@ -591,13 -638,13 +591,13 @@@ xfs_reflink_end_cow
        xfs_off_t                       count)
  {
        struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      struct xfs_bmbt_irec            got, prev, del;
 +      struct xfs_bmbt_irec            got, del;
        struct xfs_trans                *tp;
        xfs_fileoff_t                   offset_fsb;
        xfs_fileoff_t                   end_fsb;
        xfs_fsblock_t                   firstfsb;
        struct xfs_defer_ops            dfops;
 -      int                             error, eof = 0;
 +      int                             error;
        unsigned int                    resblks;
        xfs_filblks_t                   rlen;
        xfs_extnum_t                    idx;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
  
 -      xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 -
        /* If there is a hole at end_fsb - 1 go to the previous extent */
 -      if (eof || got.br_startoff > end_fsb) {
 +      if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
 +          got.br_startoff > end_fsb) {
                ASSERT(idx > 0);
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
 +              xfs_iext_get_extent(ifp, --idx, &got);
        }
  
        /* Walk backwards until we're out of the I/O range... */
                error = xfs_defer_finish(&tp, &dfops, ip);
                if (error)
                        goto out_defer;
 -
  next_extent:
 -              if (idx < 0)
 +              if (!xfs_iext_get_extent(ifp, idx, &got))
                        break;
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
        }
  
        error = xfs_trans_commit(tp);
        return error;
  }
  
- /*
-  * Read a page's worth of file data into the page cache.  Return the page
-  * locked.
-  */
- static struct page *
- xfs_get_page(
-       struct inode    *inode,
-       xfs_off_t       offset)
- {
-       struct address_space    *mapping;
-       struct page             *page;
-       pgoff_t                 n;
- 
-       n = offset >> PAGE_SHIFT;
-       mapping = inode->i_mapping;
-       page = read_mapping_page(mapping, n, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       lock_page(page);
-       return page;
- }
- 
- /*
-  * Compare extents of two files to see if they are the same.
-  */
- static int
- xfs_compare_extents(
-       struct inode    *src,
-       xfs_off_t       srcoff,
-       struct inode    *dest,
-       xfs_off_t       destoff,
-       xfs_off_t       len,
-       bool            *is_same)
- {
-       xfs_off_t       src_poff;
-       xfs_off_t       dest_poff;
-       void            *src_addr;
-       void            *dest_addr;
-       struct page     *src_page;
-       struct page     *dest_page;
-       xfs_off_t       cmp_len;
-       bool            same;
-       int             error;
- 
-       error = -EINVAL;
-       same = true;
-       while (len) {
-               src_poff = srcoff & (PAGE_SIZE - 1);
-               dest_poff = destoff & (PAGE_SIZE - 1);
-               cmp_len = min(PAGE_SIZE - src_poff,
-                             PAGE_SIZE - dest_poff);
-               cmp_len = min(cmp_len, len);
-               ASSERT(cmp_len > 0);
- 
-               trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
-                               XFS_I(dest), destoff);
- 
-               src_page = xfs_get_page(src, srcoff);
-               if (IS_ERR(src_page)) {
-                       error = PTR_ERR(src_page);
-                       goto out_error;
-               }
-               dest_page = xfs_get_page(dest, destoff);
-               if (IS_ERR(dest_page)) {
-                       error = PTR_ERR(dest_page);
-                       unlock_page(src_page);
-                       put_page(src_page);
-                       goto out_error;
-               }
-               src_addr = kmap_atomic(src_page);
-               dest_addr = kmap_atomic(dest_page);
- 
-               flush_dcache_page(src_page);
-               flush_dcache_page(dest_page);
- 
-               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
-                       same = false;
- 
-               kunmap_atomic(dest_addr);
-               kunmap_atomic(src_addr);
-               unlock_page(dest_page);
-               unlock_page(src_page);
-               put_page(dest_page);
-               put_page(src_page);
- 
-               if (!same)
-                       break;
-               srcoff += cmp_len;
-               destoff += cmp_len;
-               len -= cmp_len;
-       }
- 
-       *is_same = same;
-       return 0;
- 
- out_error:
-       trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
-       return error;
- }
- 
  /*
   * Link a range of blocks from one file to another.
   */
@@@ -1235,14 -1181,11 +1130,11 @@@ xfs_reflink_remap_range
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        struct xfs_mount        *mp = src->i_mount;
-       loff_t                  bs = inode_out->i_sb->s_blocksize;
        bool                    same_inode = (inode_in == inode_out);
        xfs_fileoff_t           sfsbno, dfsbno;
        xfs_filblks_t           fsblen;
        xfs_extlen_t            cowextsize;
-       loff_t                  isize;
        ssize_t                 ret;
-       loff_t                  blen;
  
        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return -EOPNOTSUPP;
                return -EIO;
  
        /* Lock both files against IO */
 -      if (same_inode) {
 -              xfs_ilock(src, XFS_IOLOCK_EXCL);
 +      lock_two_nondirectories(inode_in, inode_out);
 +      if (same_inode)
                xfs_ilock(src, XFS_MMAPLOCK_EXCL);
 -      } else {
 -              xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
 +      else
                xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
 -      }
  
-       /* Don't touch certain kinds of inodes */
-       ret = -EPERM;
-       if (IS_IMMUTABLE(inode_out))
-               goto out_unlock;
- 
-       ret = -ETXTBSY;
-       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-               goto out_unlock;
- 
-       /* Don't reflink dirs, pipes, sockets... */
-       ret = -EISDIR;
-       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-               goto out_unlock;
+       /* Check file eligibility and prepare for block sharing. */
        ret = -EINVAL;
-       if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-               goto out_unlock;
-       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-               goto out_unlock;
        /* Don't reflink realtime inodes */
        if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
                goto out_unlock;
        if (IS_DAX(inode_in) || IS_DAX(inode_out))
                goto out_unlock;
  
-       /* Are we going all the way to the end? */
-       isize = i_size_read(inode_in);
-       if (isize == 0) {
-               ret = 0;
-               goto out_unlock;
-       }
- 
-       /* Zero length dedupe exits immediately; reflink goes to EOF. */
-       if (len == 0) {
-               if (is_dedupe) {
-                       ret = 0;
-                       goto out_unlock;
-               }
-               len = isize - pos_in;
-       }
- 
-       /* Ensure offsets don't wrap and the input is inside i_size */
-       if (pos_in + len < pos_in || pos_out + len < pos_out ||
-           pos_in + len > isize)
-               goto out_unlock;
- 
-       /* Don't allow dedupe past EOF in the dest file */
-       if (is_dedupe) {
-               loff_t  disize;
- 
-               disize = i_size_read(inode_out);
-               if (pos_out >= disize || pos_out + len > disize)
-                       goto out_unlock;
-       }
- 
-       /* If we're linking to EOF, continue to the block boundary. */
-       if (pos_in + len == isize)
-               blen = ALIGN(isize, bs) - pos_in;
-       else
-               blen = len;
- 
-       /* Only reflink if we're aligned to block boundaries */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-               goto out_unlock;
- 
-       /* Don't allow overlapped reflink within the same file */
-       if (same_inode) {
-               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-                       goto out_unlock;
-       }
- 
-       /* Wait for the completion of any pending IOs on both files */
-       inode_dio_wait(inode_in);
-       if (!same_inode)
-               inode_dio_wait(inode_out);
- 
-       ret = filemap_write_and_wait_range(inode_in->i_mapping,
-                       pos_in, pos_in + len - 1);
-       if (ret)
-               goto out_unlock;
-       ret = filemap_write_and_wait_range(inode_out->i_mapping,
-                       pos_out, pos_out + len - 1);
-       if (ret)
+       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+                       &len, is_dedupe);
+       if (ret || len == 0)
                goto out_unlock;
  
        trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
  
-       /*
-        * Check that the extents are the same.
-        */
-       if (is_dedupe) {
-               bool            is_same = false;
- 
-               ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
-                               len, &is_same);
-               if (ret)
-                       goto out_unlock;
-               if (!is_same) {
-                       ret = -EBADE;
-                       goto out_unlock;
-               }
-       }
+       /* Set flags and remap blocks. */
        ret = xfs_reflink_set_inode_flag(src, dest);
        if (ret)
                goto out_unlock;
  
-       /*
-        * Invalidate the page cache so that we can clear any CoW mappings
-        * in the destination file.
-        */
-       truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                  PAGE_ALIGN(pos_out + len) - 1);
- 
        dfsbno = XFS_B_TO_FSBT(mp, pos_out);
        sfsbno = XFS_B_TO_FSBT(mp, pos_in);
        fsblen = XFS_B_TO_FSB(mp, len);
        if (ret)
                goto out_unlock;
  
+       /* Zap any page cache for the destination file's range. */
+       truncate_inode_pages_range(&inode_out->i_data, pos_out,
+                                  PAGE_ALIGN(pos_out + len) - 1);
+ 
        /*
         * Carry the cowextsize hint from src to dest if we're sharing the
         * entire source file to the entire destination file, the source file
  
  out_unlock:
        xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
 -      xfs_iunlock(src, XFS_IOLOCK_EXCL);
 -      if (src->i_ino != dest->i_ino) {
 +      if (!same_inode)
                xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
 -              xfs_iunlock(dest, XFS_IOLOCK_EXCL);
 -      }
 +      unlock_two_nondirectories(inode_in, inode_out);
        if (ret)
                trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
        return ret;
@@@ -1648,3 -1502,37 +1447,3 @@@ out
        trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
        return error;
  }
 -
 -/*
 - * Does this inode have any real CoW reservations?
 - */
 -bool
 -xfs_reflink_has_real_cow_blocks(
 -      struct xfs_inode                *ip)
 -{
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 -      xfs_extnum_t                    idx;
 -
 -      if (!xfs_is_reflink_inode(ip))
 -              return false;
 -
 -      /* Go find the old extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
 -      while (gotp) {
 -              xfs_bmbt_get_all(gotp, &irec);
 -
 -              if (!isnullstartblock(irec.br_startblock))
 -                      return true;
 -
 -              /* Roll on... */
 -              idx++;
 -              if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
 -                      break;
 -              gotp = xfs_iext_get_ext(ifp, idx);
 -      }
 -
 -      return false;
 -}
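Across xfs_reflink.c this merge replaces every open-coded extent walk (xfs_bmap_search_extents() plus xfs_bmbt_get_all() on raw in-core records) with the xfs_iext_lookup_extent()/xfs_iext_get_extent() pair, which hand back decoded xfs_bmbt_irec structures and a boolean instead of eof/index bookkeeping. The resulting iteration idiom, condensed from the converted functions above (a sketch, not a complete function):

	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		idx;

	/* false means no extent at or after offset_fsb */
	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
		return 0;
	while (got.br_startoff < end_fsb) {
		/* ... operate on "got" ... */
		if (!xfs_iext_get_extent(ifp, ++idx, &got))
			break;		/* ran off the end of the fork */
	}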
diff --combined include/linux/fs.h
@@@ -28,6 -28,7 +28,6 @@@
  #include <linux/uidgid.h>
  #include <linux/lockdep.h>
  #include <linux/percpu-rwsem.h>
 -#include <linux/blk_types.h>
  #include <linux/workqueue.h>
  #include <linux/percpu-rwsem.h>
  #include <linux/delayed_call.h>
@@@ -37,7 -38,6 +37,7 @@@
  
  struct backing_dev_info;
  struct bdi_writeback;
 +struct bio;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
@@@ -151,6 -151,58 +151,6 @@@ typedef int (dio_iodone_t)(struct kioc
   */
  #define CHECK_IOVEC_ONLY -1
  
 -/*
 - * The below are the various read and write flags that we support. Some of
 - * them include behavioral modifiers that send information down to the
 - * block layer and IO scheduler. They should be used along with a req_op.
 - * Terminology:
 - *
 - *    The block layer uses device plugging to defer IO a little bit, in
 - *    the hope that we will see more IO very shortly. This increases
 - *    coalescing of adjacent IO and thus reduces the number of IOs we
 - *    have to send to the device. It also allows for better queuing,
 - *    if the IO isn't mergeable. If the caller is going to be waiting
 - *    for the IO, then he must ensure that the device is unplugged so
 - *    that the IO is dispatched to the driver.
 - *
 - *    All IO is handled async in Linux. This is fine for background
 - *    writes, but for reads or writes that someone waits for completion
 - *    on, we want to notify the block layer and IO scheduler so that they
 - *    know about it. That allows them to make better scheduling
 - *    decisions. So when the below references 'sync' and 'async', it
 - *    is referencing this priority hint.
 - *
 - * With that in mind, the available types are:
 - *
 - * READ                       A normal read operation. Device will be plugged.
 - * READ_SYNC          A synchronous read. Device is not plugged, caller can
 - *                    immediately wait on this read without caring about
 - *                    unplugging.
 - * WRITE              A normal async write. Device will be plugged.
 - * WRITE_SYNC         Synchronous write. Identical to WRITE, but passes down
 - *                    the hint that someone will be waiting on this IO
 - *                    shortly. The write equivalent of READ_SYNC.
 - * WRITE_ODIRECT      Special case write for O_DIRECT only.
 - * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
 - * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
 - *                    non-volatile media on completion.
 - * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
 - *                    by a cache flush and data is guaranteed to be on
 - *                    non-volatile media on completion.
 - *
 - */
 -#define RW_MASK                       REQ_OP_WRITE
 -
 -#define READ                  REQ_OP_READ
 -#define WRITE                 REQ_OP_WRITE
 -
 -#define READ_SYNC             REQ_SYNC
 -#define WRITE_SYNC            (REQ_SYNC | REQ_NOIDLE)
 -#define WRITE_ODIRECT         REQ_SYNC
 -#define WRITE_FLUSH           (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
 -#define WRITE_FUA             (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 -#define WRITE_FLUSH_FUA               (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
 -
  /*
   * Attribute flags.  These should be or-ed together to figure out what
   * has been changed!
@@@ -1726,24 -1778,17 +1726,30 @@@ extern ssize_t vfs_writev(struct file *
                unsigned long, loff_t *, int);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
+ extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+                                     struct inode *inode_out, loff_t pos_out,
+                                     u64 *len, bool is_dedupe);
  extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, u64 len);
+ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                        struct inode *dest, loff_t destoff,
+                                        loff_t len, bool *is_same);
  extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
  
 +static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
 +                                    struct file *file_out, loff_t pos_out,
 +                                    u64 len)
 +{
 +      int ret;
 +
 +      sb_start_write(file_inode(file_out)->i_sb);
 +      ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
 +      sb_end_write(file_inode(file_out)->i_sb);
 +
 +      return ret;
 +}
 +
  struct super_operations {
        struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
@@@ -2084,11 -2129,11 +2090,11 @@@ extern int may_umount_tree(struct vfsmo
  extern int may_umount(struct vfsmount *);
  extern long do_mount(const char *, const char __user *,
                     const char *, unsigned long, void *);
- extern struct vfsmount *collect_mounts(struct path *);
+ extern struct vfsmount *collect_mounts(const struct path *);
  extern void drop_collected_mounts(struct vfsmount *);
  extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
- extern int vfs_statfs(struct path *, struct kstatfs *);
+ extern int vfs_statfs(const struct path *, struct kstatfs *);
  extern int user_statfs(const char __user *, struct kstatfs *);
  extern int fd_statfs(int, struct kstatfs *);
  extern int vfs_ustat(dev_t, struct kstatfs *);
@@@ -2460,6 -2505,19 +2466,6 @@@ extern void make_bad_inode(struct inod
  extern bool is_bad_inode(struct inode *);
  
  #ifdef CONFIG_BLOCK
 -static inline bool op_is_write(unsigned int op)
 -{
 -      return op == REQ_OP_READ ? false : true;
 -}
 -
 -/*
 - * return data direction, READ or WRITE
 - */
 -static inline int bio_data_dir(struct bio *bio)
 -{
 -      return op_is_write(bio_op(bio)) ? WRITE : READ;
 -}
 -
  extern void check_disk_size_change(struct gendisk *disk,
                                   struct block_device *bdev);
  extern int revalidate_disk(struct gendisk *);
@@@ -2657,7 -2715,7 +2663,7 @@@ extern struct file * open_exec(const ch
   
  /* fs/dcache.c -- generic fs support functions */
  extern bool is_subdir(struct dentry *, struct dentry *);
- extern bool path_is_under(struct path *, struct path *);
+ extern bool path_is_under(const struct path *, const struct path *);
  
  extern char *file_path(struct file *, char *, int);
  
@@@ -2730,6 -2788,7 +2736,6 @@@ static inline void remove_inode_hash(st
  extern void inode_sb_list_add(struct inode *inode);
  
  #ifdef CONFIG_BLOCK
 -extern blk_qc_t submit_bio(struct bio *);
  extern int bdev_read_only(struct block_device *);
  #endif
  extern int set_blocksize(struct block_device *, int);
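Alongside the constified struct path prototypes, this fs.h hunk exports the clone/dedupe preparation helpers (vfs_clone_file_prep_inodes(), vfs_dedupe_file_range_compare()) that xfs_reflink_remap_range() consumed earlier in the diff, and adds the do_clone_file_range() wrapper, which brackets vfs_clone_file_range() with sb_start_write()/sb_end_write() for in-kernel callers that clone outside the write() syscall path and so hold no freeze protection on the destination superblock. A hedged sketch of such a caller (example_copy_up() is hypothetical):

static int example_copy_up(struct file *old, struct file *new, loff_t len)
{
	int err;

	/* takes freeze protection on new's superblock for us */
	err = do_clone_file_range(old, 0, new, 0, len);
	if (!err)
		return 0;

	/* different fs, or cloning unsupported: fall back to copying */
	return err;
}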
diff --combined kernel/audit.c
@@@ -107,6 -107,7 +107,6 @@@ static u32 audit_rate_limit
   * When set to zero, this means unlimited. */
  static u32    audit_backlog_limit = 64;
  #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
 -static u32    audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
  static u32    audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
  
  /* The identity of the user shutting down the audit system. */
@@@ -125,7 -126,7 +125,7 @@@ static atomic_t    audit_lost = ATOMIC_
  
  /* The netlink socket. */
  static struct sock *audit_sock;
 -static int audit_net_id;
 +static unsigned int audit_net_id;
  
  /* Hash for inode-based rules */
  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@@ -137,18 -138,11 +137,18 @@@ static DEFINE_SPINLOCK(audit_freelist_l
  static int       audit_freelist_count;
  static LIST_HEAD(audit_freelist);
  
 -static struct sk_buff_head audit_skb_queue;
 -/* queue of skbs to send to auditd when/if it comes back */
 -static struct sk_buff_head audit_skb_hold_queue;
 +/* queue msgs to send via kauditd_task */
 +static struct sk_buff_head audit_queue;
 +/* queue msgs due to temporary unicast send problems */
 +static struct sk_buff_head audit_retry_queue;
 +/* queue msgs waiting for new auditd connection */
 +static struct sk_buff_head audit_hold_queue;
 +
 +/* queue servicing thread */
  static struct task_struct *kauditd_task;
  static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 +
 +/* waitqueue for callers who are blocked on the audit backlog */
  static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
  
  static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
@@@ -344,7 -338,7 +344,7 @@@ static int audit_set_backlog_limit(u32 
  static int audit_set_backlog_wait_time(u32 timeout)
  {
        return audit_do_config_change("audit_backlog_wait_time",
 -                                    &audit_backlog_wait_time_master, timeout);
 +                                    &audit_backlog_wait_time, timeout);
  }
  
  static int audit_set_enabled(u32 state)
@@@ -370,11 -364,30 +370,11 @@@ static int audit_set_failure(u32 state
        return audit_do_config_change("audit_failure", &audit_failure, state);
  }
  
 -/*
 - * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
 - * already have been sent via prink/syslog and so if these messages are dropped
 - * it is not a huge concern since we already passed the audit_log_lost()
 - * notification and stuff.  This is just nice to get audit messages during
 - * boot before auditd is running or messages generated while auditd is stopped.
 - * This only holds messages if audit_default is set, aka booting with audit=1
 - * or building your kernel that way.
 - */
 -static void audit_hold_skb(struct sk_buff *skb)
 -{
 -      if (audit_default &&
 -          (!audit_backlog_limit ||
 -           skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
 -              skb_queue_tail(&audit_skb_hold_queue, skb);
 -      else
 -              kfree_skb(skb);
 -}
 -
  /*
   * For one reason or another this nlh isn't getting delivered to the userspace
   * audit daemon, just send it to printk.
   */
 -static void audit_printk_skb(struct sk_buff *skb)
 +static void kauditd_printk_skb(struct sk_buff *skb)
  {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        char *data = nlmsg_data(nlh);
                else
                        audit_log_lost("printk limit exceeded");
        }
 +}
 +
 +/**
 + * kauditd_hold_skb - Queue an audit record, waiting for auditd
 + * @skb: audit record
 + *
 + * Description:
 + * Queue the audit record, waiting for an instance of auditd.  When this
 + * function is called we haven't given up yet on sending the record, but things
 + * are not looking good.  The first thing we want to do is try to write the
 + * record via printk and then see if we want to try and hold on to the record
 + * and queue it, if we have room.  If we want to hold on to the record, but we
 + * don't have room, record a record lost message.
 + */
 +static void kauditd_hold_skb(struct sk_buff *skb)
 +{
 +      /* at this point it is uncertain if we will ever send this to auditd so
 +       * try to send the message via printk before we go any further */
 +      kauditd_printk_skb(skb);
 +
 +      /* can we just silently drop the message? */
 +      if (!audit_default) {
 +              kfree_skb(skb);
 +              return;
 +      }
 +
 +      /* if we have room, queue the message */
 +      if (!audit_backlog_limit ||
 +          skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
 +              skb_queue_tail(&audit_hold_queue, skb);
 +              return;
 +      }
  
 -      audit_hold_skb(skb);
 +      /* we have no other options - drop the message */
 +      audit_log_lost("kauditd hold queue overflow");
 +      kfree_skb(skb);
  }
  
 -static void kauditd_send_skb(struct sk_buff *skb)
 +/**
 + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
 + * @skb: audit record
 + *
 + * Description:
 + * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
 + * but for some reason we are having problems sending it audit records so
 + * queue the given record and attempt to resend.
 + */
 +static void kauditd_retry_skb(struct sk_buff *skb)
  {
 -      int err;
 -      int attempts = 0;
 -#define AUDITD_RETRIES 5
 +      /* NOTE: because records should only live in the retry queue for a
 +       * short period of time, before either being sent or moved to the hold
 +       * queue, we don't currently enforce a limit on this queue */
 +      skb_queue_tail(&audit_retry_queue, skb);
 +}
 +
 +/**
 + * auditd_reset - Disconnect the auditd connection
 + *
 + * Description:
 + * Break the auditd/kauditd connection and move all the records in the retry
 + * queue into the hold queue in case auditd reconnects.  The audit_cmd_mutex
 + * must be held when calling this function.
 + */
 +static void auditd_reset(void)
 +{
 +      struct sk_buff *skb;
 +
 +      /* break the connection */
 +      if (audit_sock) {
 +              sock_put(audit_sock);
 +              audit_sock = NULL;
 +      }
 +      audit_pid = 0;
 +      audit_nlk_portid = 0;
 +
 +      /* flush all of the retry queue to the hold queue */
 +      while ((skb = skb_dequeue(&audit_retry_queue)))
 +              kauditd_hold_skb(skb);
 +}
 +
 +/**
 + * kauditd_send_unicast_skb - Send a record via unicast to auditd
 + * @skb: audit record
 + */
 +static int kauditd_send_unicast_skb(struct sk_buff *skb)
 +{
 +      int rc;
  
 -restart:
 -      /* take a reference in case we can't send it and we want to hold it */
 +      /* if we know nothing is connected, don't even try the netlink call */
 +      if (!audit_pid)
 +              return -ECONNREFUSED;
 +
 +      /* get an extra skb reference in case we fail to send */
        skb_get(skb);
 -      err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
 -      if (err < 0) {
 -              pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
 -                     audit_pid, err);
 -              if (audit_pid) {
 -                      if (err == -ECONNREFUSED || err == -EPERM
 -                          || ++attempts >= AUDITD_RETRIES) {
 -                              char s[32];
 -
 -                              snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
 -                              audit_log_lost(s);
 -                              audit_pid = 0;
 -                              audit_sock = NULL;
 -                      } else {
 -                              pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
 -                                      attempts, audit_pid);
 -                              set_current_state(TASK_INTERRUPTIBLE);
 -                              schedule();
 -                              goto restart;
 -                      }
 -              }
 -              /* we might get lucky and get this in the next auditd */
 -              audit_hold_skb(skb);
 -      } else
 -              /* drop the extra reference if sent ok */
 +      rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
 +      if (rc >= 0) {
                consume_skb(skb);
 +              rc = 0;
 +      }
 +
 +      return rc;
  }
  
  /*
 - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
 + * kauditd_send_multicast_skb - Send a record to any multicast listeners
 + * @skb: audit record
   *
 + * Description:
   * This function doesn't consume an skb as might be expected since it has to
   * copy it anyways.
   */
 -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
 +static void kauditd_send_multicast_skb(struct sk_buff *skb)
  {
 -      struct sk_buff          *copy;
 -      struct audit_net        *aunet = net_generic(&init_net, audit_net_id);
 -      struct sock             *sock = aunet->nlsk;
 +      struct sk_buff *copy;
 +      struct audit_net *aunet = net_generic(&init_net, audit_net_id);
 +      struct sock *sock = aunet->nlsk;
 +      struct nlmsghdr *nlh;
  
        if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
                return;
         * no reason for new multicast clients to continue with this
         * non-compliance.
         */
 -      copy = skb_copy(skb, gfp_mask);
 +      copy = skb_copy(skb, GFP_KERNEL);
        if (!copy)
                return;
 +      nlh = nlmsg_hdr(copy);
 +      nlh->nlmsg_len = skb->len;
  
 -      nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
 +      nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
  }
  
 -/*
 - * flush_hold_queue - empty the hold queue if auditd appears
 - *
 - * If auditd just started, drain the queue of messages already
 - * sent to syslog/printk.  Remember loss here is ok.  We already
 - * called audit_log_lost() if it didn't go out normally.  so the
 - * race between the skb_dequeue and the next check for audit_pid
 - * doesn't matter.
 +/**
 + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread
   *
 - * If you ever find kauditd to be too slow we can get a perf win
 - * by doing our own locking and keeping better track if there
 - * are messages in this queue.  I don't see the need now, but
 - * in 5 years when I want to play with this again I'll see this
 - * note and still have no friggin idea what i'm thinking today.
 + * Description:
 + * This function is for use by the wait_event_freezable() call in
 + * kauditd_thread().
   */
 -static void flush_hold_queue(void)
 +static int kauditd_wake_condition(void)
  {
 -      struct sk_buff *skb;
 -
 -      if (!audit_default || !audit_pid)
 -              return;
 -
 -      skb = skb_dequeue(&audit_skb_hold_queue);
 -      if (likely(!skb))
 -              return;
 +      static int pid_last = 0;
 +      int rc;
 +      int pid = audit_pid;
  
 -      while (skb && audit_pid) {
 -              kauditd_send_skb(skb);
 -              skb = skb_dequeue(&audit_skb_hold_queue);
 -      }
 +      /* wake on new messages or a change in the connected auditd */
 +      rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
 +      if (rc)
 +              pid_last = pid;
  
 -      /*
 -       * if auditd just disappeared but we
 -       * dequeued an skb we need to drop ref
 -       */
 -      consume_skb(skb);
 +      return rc;
  }
  
  static int kauditd_thread(void *dummy)
  {
 +      int rc;
 +      int auditd = 0;
 +      int reschedule = 0;
 +      struct sk_buff *skb;
 +      struct nlmsghdr *nlh;
 +
 +#define UNICAST_RETRIES 5
 +#define AUDITD_BAD(x,y) \
 +      ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
 +
 +      /* NOTE: we do invalidate the auditd connection flag on any sending
 +       * errors, but we only "restore" the connection flag at specific places
 +       * in the loop in order to help ensure proper ordering of audit
 +       * records */
 +
        set_freezable();
        while (!kthread_should_stop()) {
 -              struct sk_buff *skb;
 -
 -              flush_hold_queue();
 +              /* NOTE: possible area for future improvement is to look at
 +               *       the hold and retry queues, since only this thread
 +               *       has access to these queues we might be able to do
 +               *       our own queuing and skip some/all of the locking */
 +
 +              /* NOTE: it might be a fun experiment to split the hold and
 +               *       retry queue handling to another thread, but the
 +               *       synchronization issues and other overhead might kill
 +               *       any performance gains */
 +
 +              /* attempt to flush the hold queue */
 +              while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
 +                      rc = kauditd_send_unicast_skb(skb);
 +                      if (rc) {
 +                              /* requeue to the same spot */
 +                              skb_queue_head(&audit_hold_queue, skb);
 +
 +                              auditd = 0;
 +                              if (AUDITD_BAD(rc, reschedule)) {
 +                                      mutex_lock(&audit_cmd_mutex);
 +                                      auditd_reset();
 +                                      mutex_unlock(&audit_cmd_mutex);
 +                                      reschedule = 0;
 +                              }
 +                      } else
 +                              /* we were able to send successfully */
 +                              reschedule = 0;
 +              }
  
 -              skb = skb_dequeue(&audit_skb_queue);
 +              /* attempt to flush the retry queue */
 +              while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
 +                      rc = kauditd_send_unicast_skb(skb);
 +                      if (rc) {
 +                              auditd = 0;
 +                              if (AUDITD_BAD(rc, reschedule)) {
 +                                      kauditd_hold_skb(skb);
 +                                      mutex_lock(&audit_cmd_mutex);
 +                                      auditd_reset();
 +                                      mutex_unlock(&audit_cmd_mutex);
 +                                      reschedule = 0;
 +                              } else
 +                                      /* temporary problem (we hope), queue
 +                                       * to the same spot and retry */
 +                                      skb_queue_head(&audit_retry_queue, skb);
 +                      } else
 +                              /* we were able to send successfully */
 +                              reschedule = 0;
 +              }
  
 +              /* standard queue processing, try to be as quick as possible */
 +quick_loop:
 +              skb = skb_dequeue(&audit_queue);
                if (skb) {
 -                      if (!audit_backlog_limit ||
 -                          (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
 -                              wake_up(&audit_backlog_wait);
 -                      if (audit_pid)
 -                              kauditd_send_skb(skb);
 +                      /* setup the netlink header, see the comments in
 +                       * kauditd_send_multicast_skb() for length quirks */
 +                      nlh = nlmsg_hdr(skb);
 +                      nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
 +
 +                      /* attempt to send to any multicast listeners */
 +                      kauditd_send_multicast_skb(skb);
 +
 +                      /* attempt to send to auditd, queue on failure */
 +                      if (auditd) {
 +                              rc = kauditd_send_unicast_skb(skb);
 +                              if (rc) {
 +                                      auditd = 0;
 +                                      if (AUDITD_BAD(rc, reschedule)) {
 +                                              mutex_lock(&audit_cmd_mutex);
 +                                              auditd_reset();
 +                                              mutex_unlock(&audit_cmd_mutex);
 +                                              reschedule = 0;
 +                                      }
 +
 +                                      /* move to the retry queue */
 +                                      kauditd_retry_skb(skb);
 +                              } else
 +                                      /* everything is working so go fast! */
 +                                      goto quick_loop;
 +                      } else if (reschedule)
 +                              /* we are currently having problems, move to
 +                               * the retry queue */
 +                              kauditd_retry_skb(skb);
                        else
 -                              audit_printk_skb(skb);
 -                      continue;
 -              }
 +                              /* dump the message via printk and hold it */
 +                              kauditd_hold_skb(skb);
 +              } else {
 +                      /* we have flushed the backlog so wake everyone */
 +                      wake_up(&audit_backlog_wait);
 +
 +                      /* if everything is okay with auditd (if present), go
 +                       * to sleep until there is something new in the queue
 +                       * or we have a change in the connected auditd;
 +                       * otherwise simply reschedule to give things a chance
 +                       * to recover */
 +                      if (reschedule) {
 +                              set_current_state(TASK_INTERRUPTIBLE);
 +                              schedule();
 +                      } else
 +                              wait_event_freezable(kauditd_wait,
 +                                                   kauditd_wake_condition());
  
 -              wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
 +                      /* update the auditd connection status */
 +                      auditd = (audit_pid ? 1 : 0);
 +              }
        }
 +
        return 0;
  }
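The reworked kauditd_thread() above services three queues in strict order: the hold queue (records parked for the next auditd instance), the retry queue (records that hit a transient unicast failure), and only then the main queue, so records reach auditd in the order they were generated. Its per-record disposition after a failed unicast send can be condensed to the following sketch (a slight simplification of the loop above; in the real code auditd_reset() is called with audit_cmd_mutex held):

	rc = kauditd_send_unicast_skb(skb);
	if (rc == 0)
		return;				/* delivered to auditd */
	if (rc == -ECONNREFUSED || rc == -EPERM ||
	    ++reschedule >= UNICAST_RETRIES) {
		auditd_reset();			/* connection is dead */
		kauditd_hold_skb(skb);		/* printk, then hold queue */
	} else
		kauditd_retry_skb(skb);		/* transient, retry soon */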
  
@@@ -735,7 -596,6 +735,7 @@@ static int audit_send_reply_thread(voi
        kfree(reply);
        return 0;
  }
 +
  /**
   * audit_send_reply - send an audit reply message via netlink
   * @request_skb: skb of request we are replying to (used to target the reply)
@@@ -972,6 -832,16 +972,6 @@@ static int audit_receive_msg(struct sk_
        if (err)
                return err;
  
 -      /* As soon as there's any sign of userspace auditd,
 -       * start kauditd to talk to it */
 -      if (!kauditd_task) {
 -              kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 -              if (IS_ERR(kauditd_task)) {
 -                      err = PTR_ERR(kauditd_task);
 -                      kauditd_task = NULL;
 -                      return err;
 -              }
 -      }
        seq  = nlh->nlmsg_seq;
        data = nlmsg_data(nlh);
  
                s.rate_limit            = audit_rate_limit;
                s.backlog_limit         = audit_backlog_limit;
                s.lost                  = atomic_read(&audit_lost);
 -              s.backlog               = skb_queue_len(&audit_skb_queue);
 +              s.backlog               = skb_queue_len(&audit_queue);
                s.feature_bitmap        = AUDIT_FEATURE_BITMAP_ALL;
 -              s.backlog_wait_time     = audit_backlog_wait_time_master;
 +              s.backlog_wait_time     = audit_backlog_wait_time;
                audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
                break;
        }
                        }
                        if (audit_enabled != AUDIT_OFF)
                                audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
 -                      audit_pid = new_pid;
 -                      audit_nlk_portid = NETLINK_CB(skb).portid;
 -                      audit_sock = skb->sk;
 +                      if (new_pid) {
 +                              if (audit_sock)
 +                                      sock_put(audit_sock);
 +                              audit_pid = new_pid;
 +                              audit_nlk_portid = NETLINK_CB(skb).portid;
 +                              sock_hold(skb->sk);
 +                              audit_sock = skb->sk;
 +                      } else {
 +                              auditd_reset();
 +                      }
 +                      wake_up_interruptible(&kauditd_wait);
                }
                if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(s.rate_limit);
@@@ -1305,13 -1167,14 +1305,13 @@@ static void __net_exit audit_net_exit(s
  {
        struct audit_net *aunet = net_generic(net, audit_net_id);
        struct sock *sock = aunet->nlsk;
 -      if (sock == audit_sock) {
 -              audit_pid = 0;
 -              audit_sock = NULL;
 -      }
 +      mutex_lock(&audit_cmd_mutex);
 +      if (sock == audit_sock)
 +              auditd_reset();
 +      mutex_unlock(&audit_cmd_mutex);
  
 -      RCU_INIT_POINTER(aunet->nlsk, NULL);
 -      synchronize_net();
        netlink_kernel_release(sock);
 +      aunet->nlsk = NULL;
  }
  
  static struct pernet_operations audit_net_ops __net_initdata = {
@@@ -1333,24 -1196,17 +1333,24 @@@ static int __init audit_init(void
                audit_default ? "enabled" : "disabled");
        register_pernet_subsys(&audit_net_ops);
  
 -      skb_queue_head_init(&audit_skb_queue);
 -      skb_queue_head_init(&audit_skb_hold_queue);
 +      skb_queue_head_init(&audit_queue);
 +      skb_queue_head_init(&audit_retry_queue);
 +      skb_queue_head_init(&audit_hold_queue);
        audit_initialized = AUDIT_INITIALIZED;
        audit_enabled = audit_default;
        audit_ever_enabled |= !!audit_default;
  
 -      audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 -
        for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
                INIT_LIST_HEAD(&audit_inode_hash[i]);
  
 +      kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 +      if (IS_ERR(kauditd_task)) {
 +              int err = PTR_ERR(kauditd_task);
 +              panic("audit: failed to start the kauditd thread (%d)\n", err);
 +      }
 +
 +      audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 +
        return 0;
  }
  __initcall(audit_init);
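audit_init() now starts kauditd unconditionally and treats failure as fatal. The IS_ERR()/PTR_ERR() pair it uses works because kthread_run() never returns NULL on failure; it returns an errno encoded into the top of the pointer range. A sketch of that convention, paraphrasing the kernel's include/linux/err.h (treat this as illustrative rather than the exact kernel source):

#include <stdbool.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
        return (void *)error;                   /* e.g. ERR_PTR(-ENOMEM) */
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;                       /* recover the errno */
}

static inline bool IS_ERR(const void *ptr)
{
        /* the highest MAX_ERRNO addresses are reserved for error codes */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

That is why the hunk compares with IS_ERR() rather than against NULL: on failure kauditd_task holds an encoded errno, never a null pointer.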
@@@ -1483,6 -1339,24 +1483,6 @@@ static inline void audit_get_stamp(stru
        }
  }
  
 -/*
 - * Wait for auditd to drain the queue a little
 - */
 -static long wait_for_auditd(long sleep_time)
 -{
 -      DECLARE_WAITQUEUE(wait, current);
 -
 -      if (audit_backlog_limit &&
 -          skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
 -              add_wait_queue_exclusive(&audit_backlog_wait, &wait);
 -              set_current_state(TASK_UNINTERRUPTIBLE);
 -              sleep_time = schedule_timeout(sleep_time);
 -              remove_wait_queue(&audit_backlog_wait, &wait);
 -      }
 -
 -      return sleep_time;
 -}
 -
  /**
   * audit_log_start - obtain an audit buffer
   * @ctx: audit_context (may be NULL)
  struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                     int type)
  {
 -      struct audit_buffer     *ab     = NULL;
 -      struct timespec         t;
 -      unsigned int            uninitialized_var(serial);
 -      int reserve = 5; /* Allow atomic callers to go up to five
 -                          entries over the normal backlog limit */
 -      unsigned long timeout_start = jiffies;
 +      struct audit_buffer *ab;
 +      struct timespec t;
 +      unsigned int uninitialized_var(serial);
  
        if (audit_initialized != AUDIT_INITIALIZED)
                return NULL;
        if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
                return NULL;
  
 -      if (gfp_mask & __GFP_DIRECT_RECLAIM) {
 -              if (audit_pid && audit_pid == current->tgid)
 -                      gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 -              else
 -                      reserve = 0;
 -      }
 -
 -      while (audit_backlog_limit
 -             && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
 -              if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
 -                      long sleep_time;
 +      /* don't ever fail/sleep on these two conditions:
 +       * 1. auditd generated record - since we need auditd to drain the
 +       *    queue; also, when we are checking for auditd, compare PIDs using
 +       *    task_tgid_vnr() since audit_pid is set in audit_receive_msg()
 +       *    using a PID anchored in the caller's namespace
 +       * 2. audit command message - record types 1000 through 1099 inclusive
 +       *    are command messages/records used to manage the kernel subsystem
 +       *    and the audit userspace, blocking on these messages could cause
 +       *    problems under load, so don't do it (note: not all of these
 +       *    command types are valid as record types, but it is quicker to
 +       *    just check two ints than a series of ints in an if/switch stmt) */
 +      if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
 +            (type >= 1000 && type <= 1099))) {
 +              long sleep_time = audit_backlog_wait_time;
 +
 +              while (audit_backlog_limit &&
 +                     (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
 +                      /* wake kauditd to try to flush the queue */
 +                      wake_up_interruptible(&kauditd_wait);
  
 -                      sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
 -                      if (sleep_time > 0) {
 -                              sleep_time = wait_for_auditd(sleep_time);
 -                              if (sleep_time > 0)
 -                                      continue;
 +                      /* sleep if we are allowed and we haven't exhausted our
 +                       * backlog wait limit */
 +                      if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
 +                          (sleep_time > 0)) {
 +                              DECLARE_WAITQUEUE(wait, current);
 +
 +                              add_wait_queue_exclusive(&audit_backlog_wait,
 +                                                       &wait);
 +                              set_current_state(TASK_UNINTERRUPTIBLE);
 +                              sleep_time = schedule_timeout(sleep_time);
 +                              remove_wait_queue(&audit_backlog_wait, &wait);
 +                      } else {
 +                              if (audit_rate_check() && printk_ratelimit())
 +                                      pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
 +                                              skb_queue_len(&audit_queue),
 +                                              audit_backlog_limit);
 +                              audit_log_lost("backlog limit exceeded");
 +                              return NULL;
                        }
                }
 -              if (audit_rate_check() && printk_ratelimit())
 -                      pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
 -                              skb_queue_len(&audit_skb_queue),
 -                              audit_backlog_limit);
 -              audit_log_lost("backlog limit exceeded");
 -              audit_backlog_wait_time = 0;
 -              wake_up(&audit_backlog_wait);
 -              return NULL;
        }
  
 -      if (!reserve && !audit_backlog_wait_time)
 -              audit_backlog_wait_time = audit_backlog_wait_time_master;
 -
        ab = audit_buffer_alloc(ctx, gfp_mask, type);
        if (!ab) {
                audit_log_lost("out of memory in audit_log_start");
        }
  
        audit_get_stamp(ab->ctx, &t, &serial);
 -
        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
                         t.tv_sec, t.tv_nsec/1000000, serial);
 +
        return ab;
  }
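The rewritten audit_log_start() gives each caller one wait budget: sleep_time starts at audit_backlog_wait_time, and because schedule_timeout() returns the unexpired portion, a task woken early that still finds the queue over limit sleeps only for what remains. A userspace sketch of the same budgeted wait, assuming pthreads and expressing the budget as an absolute deadline; the queue layout and names are illustrative:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct bounded_queue {
        pthread_mutex_t lock;
        pthread_cond_t  drained;        /* signalled when the queue shrinks */
        int             len;
        int             limit;
};

/* Returns true once there is room, false when the budget is spent
 * (the kernel path then calls audit_log_lost() and returns NULL). */
static bool wait_for_room(struct bounded_queue *q, long budget_ms)
{
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec  += budget_ms / 1000;
        deadline.tv_nsec += (budget_ms % 1000) * 1000000L;
        if (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_sec++;
                deadline.tv_nsec -= 1000000000L;
        }

        pthread_mutex_lock(&q->lock);
        while (q->len > q->limit) {
                /* Like schedule_timeout(), a timed wait can return early
                 * on a wakeup; the absolute deadline caps the total wait. */
                if (pthread_cond_timedwait(&q->drained, &q->lock,
                                           &deadline) == ETIMEDOUT) {
                        pthread_mutex_unlock(&q->lock);
                        return false;
                }
        }
        pthread_mutex_unlock(&q->lock);
        return true;
}

Callers that must not sleep (no __GFP_DIRECT_RECLAIM in the kernel version) skip the wait and fail straight away.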
  
@@@ -1893,7 -1760,7 +1893,7 @@@ void audit_copy_inode(struct audit_name
   * @call_panic: optional pointer to int that will be updated if secid fails
   */
  void audit_log_name(struct audit_context *context, struct audit_names *n,
-                   struct path *path, int record_num, int *call_panic)
+                   const struct path *path, int record_num, int *call_panic)
  {
        struct audit_buffer *ab;
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
@@@ -2081,7 -1948,7 +2081,7 @@@ EXPORT_SYMBOL(audit_log_task_info)
   * @operation: specific link operation
   * @link: the path that triggered the restriction
   */
- void audit_log_link_denied(const char *operation, struct path *link)
+ void audit_log_link_denied(const char *operation, const struct path *link)
  {
        struct audit_buffer *ab;
        struct audit_names *name;
   * audit_log_end - end one audit record
   * @ab: the audit_buffer
   *
 - * netlink_unicast() cannot be called inside an irq context because it blocks
 - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
 - * on a queue and a tasklet is scheduled to remove them from the queue outside
 - * the irq context.  May be called in any context.
 + * We cannot do a netlink send inside an irq context because it blocks (the
 + * last argument, flags, is not set to MSG_DONTWAIT), so the audit buffer is
 + * placed on a queue and the kauditd thread is woken to remove it from the
 + * queue outside the irq context.  May be called in any context.
   */
  void audit_log_end(struct audit_buffer *ab)
  {
        if (!audit_rate_check()) {
                audit_log_lost("rate limit exceeded");
        } else {
 -              struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 -
 -              nlh->nlmsg_len = ab->skb->len;
 -              kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
 -
 -              /*
 -               * The original kaudit unicast socket sends up messages with
 -               * nlmsg_len set to the payload length rather than the entire
 -               * message length.  This breaks the standard set by netlink.
 -               * The existing auditd daemon assumes this breakage.  Fixing
 -               * this would require co-ordinating a change in the established
 -               * protocol between the kaudit kernel subsystem and the auditd
 -               * userspace code.
 -               */
 -              nlh->nlmsg_len -= NLMSG_HDRLEN;
 -
 -              if (audit_pid) {
 -                      skb_queue_tail(&audit_skb_queue, ab->skb);
 -                      wake_up_interruptible(&kauditd_wait);
 -              } else {
 -                      audit_printk_skb(ab->skb);
 -              }
 +              skb_queue_tail(&audit_queue, ab->skb);
 +              wake_up_interruptible(&kauditd_wait);
                ab->skb = NULL;
        }
        audit_buffer_free(ab);
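With the unicast bookkeeping and the printk fallback moved out, audit_log_end() reduces to an unconditional enqueue plus wakeup; deciding whether a record goes to auditd, the retry queue, or the console is now kauditd's job. A minimal sketch of that producer side, with a linked list and condition variable standing in for the skb queue and kauditd_wait (all names illustrative):

#include <pthread.h>
#include <stddef.h>

struct record {
        struct record *next;
};

struct work_queue {
        pthread_mutex_t lock;
        pthread_cond_t  more_work;      /* plays the role of kauditd_wait */
        struct record  *head;
        struct record  *tail;
};

static void enqueue_and_wake(struct work_queue *q, struct record *r)
{
        r->next = NULL;
        pthread_mutex_lock(&q->lock);
        if (q->tail)
                q->tail->next = r;      /* like skb_queue_tail() */
        else
                q->head = r;
        q->tail = r;
        pthread_mutex_unlock(&q->lock);
        pthread_cond_signal(&q->more_work);     /* wake_up_interruptible() */
}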
diff --combined kernel/audit_fsnotify.c
@@@ -74,7 -74,7 +74,7 @@@ int audit_mark_compare(struct audit_fsn
  }
  
  static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
-                            struct inode *inode)
+                            const struct inode *inode)
  {
        audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
        audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
@@@ -130,9 -130,10 +130,9 @@@ static void audit_mark_log_rule_change(
        ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
 -      audit_log_format(ab, "auid=%u ses=%u op=",
 +      audit_log_format(ab, "auid=%u ses=%u op=%s",
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
 -                       audit_get_sessionid(current));
 -      audit_log_string(ab, op);
 +                       audit_get_sessionid(current), op);
        audit_log_format(ab, " path=");
        audit_log_untrustedstring(ab, audit_mark->path);
        audit_log_key(ab, rule->filterkey);
@@@ -167,11 -168,11 +167,11 @@@ static int audit_mark_handle_event(stru
                                    struct inode *to_tell,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
-                                   u32 mask, void *data, int data_type,
+                                   u32 mask, const void *data, int data_type,
                                    const unsigned char *dname, u32 cookie)
  {
        struct audit_fsnotify_mark *audit_mark;
-       struct inode *inode = NULL;
+       const struct inode *inode = NULL;
  
        audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
  
  
        switch (data_type) {
        case (FSNOTIFY_EVENT_PATH):
-               inode = ((struct path *)data)->dentry->d_inode;
+               inode = ((const struct path *)data)->dentry->d_inode;
                break;
        case (FSNOTIFY_EVENT_INODE):
-               inode = (struct inode *)data;
+               inode = (const struct inode *)data;
                break;
        default:
                BUG();
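The handler hunks in this file, and the matching ones in kernel/audit_watch.c below, all make the same change: the payload arrives as a const void * plus a type tag, and each switch case casts it to the matching const-qualified type, so a handler can no longer modify an inode it was only handed to inspect. A reduced sketch of that dispatch, with simplified stand-ins for the fsnotify types:

#include <stddef.h>

enum payload_type { PAYLOAD_PATH, PAYLOAD_INODE };

struct inode  { unsigned long i_ino; };
struct dentry { struct inode *d_inode; };
struct path   { struct dentry *dentry; };

static const struct inode *payload_inode(const void *data,
                                         enum payload_type type)
{
        switch (type) {
        case PAYLOAD_PATH:
                return ((const struct path *)data)->dentry->d_inode;
        case PAYLOAD_INODE:
                return (const struct inode *)data;
        }
        return NULL;    /* the kernel handlers BUG() here instead */
}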
diff --combined kernel/audit_tree.c
@@@ -458,7 -458,8 +458,7 @@@ static void audit_tree_log_remove_rule(
        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
 -      audit_log_format(ab, "op=");
 -      audit_log_string(ab, "remove_rule");
 +      audit_log_format(ab, "op=remove_rule");
        audit_log_format(ab, " dir=");
        audit_log_untrustedstring(ab, rule->tree->pathname);
        audit_log_key(ab, rule->filterkey);
@@@ -947,7 -948,7 +947,7 @@@ static int audit_tree_handle_event(stru
                                   struct inode *to_tell,
                                   struct fsnotify_mark *inode_mark,
                                   struct fsnotify_mark *vfsmount_mark,
-                                  u32 mask, void *data, int data_type,
+                                  u32 mask, const void *data, int data_type,
                                   const unsigned char *file_name, u32 cookie)
  {
        return 0;
diff --combined kernel/audit_watch.c
@@@ -242,9 -242,10 +242,9 @@@ static void audit_watch_log_rule_change
                ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
                if (unlikely(!ab))
                        return;
 -              audit_log_format(ab, "auid=%u ses=%u op=",
 +              audit_log_format(ab, "auid=%u ses=%u op=%s",
                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 -                               audit_get_sessionid(current));
 -              audit_log_string(ab, op);
 +                               audit_get_sessionid(current), op);
                audit_log_format(ab, " path=");
                audit_log_untrustedstring(ab, w->path);
                audit_log_key(ab, r->filterkey);
@@@ -471,10 -472,10 +471,10 @@@ static int audit_watch_handle_event(str
                                    struct inode *to_tell,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
-                                   u32 mask, void *data, int data_type,
+                                   u32 mask, const void *data, int data_type,
                                    const unsigned char *dname, u32 cookie)
  {
-       struct inode *inode;
+       const struct inode *inode;
        struct audit_parent *parent;
  
        parent = container_of(inode_mark, struct audit_parent, mark);
  
        switch (data_type) {
        case (FSNOTIFY_EVENT_PATH):
-               inode = d_backing_inode(((struct path *)data)->dentry);
+               inode = d_backing_inode(((const struct path *)data)->dentry);
                break;
        case (FSNOTIFY_EVENT_INODE):
-               inode = (struct inode *)data;
+               inode = (const struct inode *)data;
                break;
        default:
                BUG();
@@@ -547,8 -548,8 +547,8 @@@ int audit_exe_compare(struct task_struc
        exe_file = get_task_exe_file(tsk);
        if (!exe_file)
                return 0;
 -      ino = exe_file->f_inode->i_ino;
 -      dev = exe_file->f_inode->i_sb->s_dev;
 +      ino = file_inode(exe_file)->i_ino;
 +      dev = file_inode(exe_file)->i_sb->s_dev;
        fput(exe_file);
        return audit_mark_compare(mark, ino, dev);
  }
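The last hunk trades direct f_inode dereferences for the file_inode() accessor, so every caller funnels through one helper and a future change to how a file finds its inode touches a single line. A toy sketch of the pattern (the real helper lives in include/linux/fs.h; these structs are reduced stand-ins):

struct super_block { unsigned long s_dev; };
struct inode { unsigned long i_ino; struct super_block *i_sb; };
struct file  { struct inode *f_inode; };

static inline struct inode *file_inode(const struct file *f)
{
        return f->f_inode;      /* one place to change, many callers */
}

/* usage, mirroring audit_exe_compare():
 *      ino = file_inode(exe_file)->i_ino;
 *      dev = file_inode(exe_file)->i_sb->s_dev;
 */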