Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 18 Dec 2016 02:44:00 +0000 (18:44 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 18 Dec 2016 02:44:00 +0000 (18:44 -0800)
Pull more vfs updates from Al Viro:
 "In this pile:

   - autofs-namespace series
   - dedupe stuff
   - more struct path constification"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (40 commits)
  ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
  ocfs2: charge quota for reflinked blocks
  ocfs2: fix bad pointer cast
  ocfs2: always unlock when completing dio writes
  ocfs2: don't eat io errors during _dio_end_io_write
  ocfs2: budget for extent tree splits when adding refcount flag
  ocfs2: prohibit refcounted swapfiles
  ocfs2: add newlines to some error messages
  ocfs2: convert inode refcount test to a helper
  simple_write_end(): don't zero in short copy into uptodate
  exofs: don't mess with simple_write_{begin,end}
  9p: saner ->write_end() on failing copy into non-uptodate page
  fix gfs2_stuffed_write_end() on short copies
  fix ceph_write_end()
  nfs_write_end(): fix handling of short copies
  vfs: refactor clone/dedupe_file_range common functions
  fs: try to clone files first in vfs_copy_file_range
  vfs: misc struct path constification
  namespace.c: constify struct path passed to a bunch of primitives
  quota: constify struct path in quota_on
  ...

20 files changed:
Documentation/filesystems/Locking
fs/9p/vfs_addr.c
fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/ceph/addr.c
fs/ext4/super.c
fs/internal.h
fs/namei.c
fs/nfs/file.c
fs/ocfs2/aops.c
fs/ocfs2/refcounttree.c
fs/read_write.c
fs/xfs/xfs_file.c
fs/xfs/xfs_reflink.c
include/linux/fs.h
kernel/audit.c
kernel/audit_fsnotify.c
kernel/audit_tree.c
kernel/audit_watch.c

@@@ -20,7 -20,7 +20,7 @@@ prototypes
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
        struct vfsmount *(*d_automount)(struct path *path);
-       int (*d_manage)(struct dentry *, bool);
+       int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *,
                                 unsigned int);
  
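
The d_manage() conversion above touches every in-tree implementer. A minimal
sketch against the new prototype (the "examplefs" names are hypothetical, not
part of this series):

    #include <linux/dcache.h>
    #include <linux/path.h>

    static int examplefs_d_manage(const struct path *path, bool rcu_walk)
    {
            /* we cannot sleep under RCU-walk; ask the VFS to retry in ref-walk */
            if (rcu_walk)
                    return -ECHILD;

            /* returning 0 lets the VFS proceed through this dentry */
            return 0;
    }

    static const struct dentry_operations examplefs_dentry_ops = {
            .d_manage       = examplefs_d_manage,
    };
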
@@@ -556,7 -556,7 +556,7 @@@ till "end_pgoff". ->map_pages() is call
  not block.  If it's not possible to reach a page without blocking, the
  filesystem should skip it. The filesystem should use do_set_pte() to set
  up the page table entry. A pointer to the entry associated with the page
 -is passed in the "pte" field in the fault_env structure. Pointers to
 +is passed in the "pte" field in the vm_fault structure. Pointers to
  entries for other offsets should be calculated relative to "pte".
  
        ->page_mkwrite() is called when a previously read-only pte is
diff --combined fs/9p/vfs_addr.c
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/idr.h>
  #include <linux/sched.h>
  #include <linux/uio.h>
 +#include <linux/bvec.h>
  #include <net/9p/9p.h>
  #include <net/9p/client.h>
  
@@@ -310,18 -309,10 +310,10 @@@ static int v9fs_write_end(struct file *
  
        p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
  
-       if (unlikely(copied < len)) {
-               /*
-                * zero out the rest of the area
-                */
-               unsigned from = pos & (PAGE_SIZE - 1);
-               zero_user(page, from + copied, len - copied);
-               flush_dcache_page(page);
+       if (unlikely(copied < len && !PageUptodate(page))) {
+               copied = 0;
+               goto out;
        }
-       if (!PageUptodate(page))
-               SetPageUptodate(page);
        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
        if (last_pos > inode->i_size) {
                i_size_write(inode, last_pos);
        }
        set_page_dirty(page);
+ out:
        unlock_page(page);
        put_page(page);
  
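
The ->write_end() fixes in this pull (9p, gfs2, ceph, nfs, simple_write_end)
share one rule: a short copy into a page that was never brought uptodate must
be reported as zero bytes written, so that generic_perform_write() faults the
source buffer in and retries instead of exposing stale page contents. A
hedged sketch of the common shape ("examplefs" is hypothetical; each
filesystem adds its own bookkeeping):

    #include <linux/fs.h>
    #include <linux/pagemap.h>

    static int examplefs_write_end(struct file *file, struct address_space *mapping,
                                   loff_t pos, unsigned len, unsigned copied,
                                   struct page *page, void *fsdata)
    {
            struct inode *inode = page->mapping->host;
            loff_t last_pos = pos + copied;

            /* short copy into a !Uptodate page: claim nothing was written */
            if (unlikely(copied < len && !PageUptodate(page))) {
                    copied = 0;
                    goto out;
            }
            if (last_pos > inode->i_size)
                    i_size_write(inode, last_pos);
            set_page_dirty(page);
    out:
            unlock_page(page);
            put_page(page);
            return copied;
    }
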
diff --combined fs/btrfs/ctree.h
@@@ -90,6 -90,9 +90,6 @@@ static const int btrfs_csum_sizes[] = 
  /* four bytes for CRC32 */
  #define BTRFS_EMPTY_DIR_SIZE 0
  
 -/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
 -#define REQ_GET_READ_MIRRORS  (1 << 30)
 -
  /* ioprio of readahead is set to idle */
  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
  
@@@ -337,7 -340,7 +337,7 @@@ struct btrfs_path 
        unsigned int need_commit_sem:1;
        unsigned int skip_release_on_error:1;
  };
 -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
 +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
                                        sizeof(struct btrfs_item))
  struct btrfs_dev_replace {
        u64 replace_state;      /* see #define above */
@@@ -426,10 -429,6 +426,10 @@@ struct btrfs_space_info 
        struct list_head ro_bgs;
        struct list_head priority_tickets;
        struct list_head tickets;
 +      /*
 +       * tickets_id just indicates the next ticket will be handled, so note
 +       * it's not stored per ticket.
 +       */
        u64 tickets_id;
  
        struct rw_semaphore groups_sem;
@@@ -519,7 -518,7 +519,7 @@@ struct btrfs_io_ctl 
        void *cur, *orig;
        struct page *page;
        struct page **pages;
 -      struct btrfs_root *root;
 +      struct btrfs_fs_info *fs_info;
        struct inode *inode;
        unsigned long size;
        int index;
@@@ -799,6 -798,7 +799,6 @@@ struct btrfs_fs_info 
        spinlock_t super_lock;
        struct btrfs_super_block *super_copy;
        struct btrfs_super_block *super_for_commit;
 -      struct block_device *__bdev;
        struct super_block *sb;
        struct inode *btree_inode;
        struct backing_dev_info bdi;
  
        /* Used to record internally whether fs has been frozen */
        int fs_frozen;
 +
 +      /* Cached block sizes */
 +      u32 nodesize;
 +      u32 sectorsize;
 +      u32 stripesize;
  };
  
 +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 +{
 +      return sb->s_fs_info;
 +}
 +
  struct btrfs_subvolume_writers {
        struct percpu_counter   counter;
        wait_queue_head_t       wait;
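
With btrfs_sb() hoisted this early into ctree.h and the block sizes cached in
btrfs_fs_info, call sites can go from an inode to the filesystem geometry
without detouring through a root. An illustrative helper (hypothetical,
composing the pieces introduced above; assumes ctree.h):

    static u64 example_round_to_sector(const struct inode *inode, u64 bytes)
    {
            struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

            /* sectorsize is now cached once per filesystem, not per root */
            return round_up(bytes, (u64)fs_info->sectorsize);
    }
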
@@@ -1169,6 -1159,14 +1169,6 @@@ struct btrfs_root 
        u64 objectid;
        u64 last_trans;
  
 -      /* data allocations are done in sectorsize units */
 -      u32 sectorsize;
 -
 -      /* node allocations are done in nodesize units */
 -      u32 nodesize;
 -
 -      u32 stripesize;
 -
        u32 type;
  
        u64 highest_objectid;
        /* For qgroup metadata space reserve */
        atomic_t qgroup_meta_rsv;
  };
 +static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
 +{
 +      return btrfs_sb(inode->i_sb)->sectorsize;
 +}
  
  static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
  {
        return blocksize - sizeof(struct btrfs_header);
  }
  
 -static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
  {
 -      return __BTRFS_LEAF_DATA_SIZE(root->nodesize);
 +      return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
  }
  
 -static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
 +      return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
  }
  
 -static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
 +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr);
 +      return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
  }
  
  #define BTRFS_FILE_EXTENT_INLINE_DATA_START           \
                (offsetof(struct btrfs_file_extent_item, disk_bytenr))
 -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_MAX_ITEM_SIZE(root) -
 +      return BTRFS_MAX_ITEM_SIZE(info) -
               BTRFS_FILE_EXTENT_INLINE_DATA_START;
  }
  
 -static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
 +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
  {
 -      return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item);
 +      return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
  }
  
  /*
  
  #ifdef CONFIG_BTRFS_DEBUG
  static inline int
 -btrfs_should_fragment_free_space(struct btrfs_root *root,
 -                               struct btrfs_block_group_cache *block_group)
 +btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group)
  {
 -      return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
 +      struct btrfs_fs_info *fs_info = block_group->fs_info;
 +
 +      return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
                block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 -             (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
 +             (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
                block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
  }
  #endif
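
All of the geometry helpers above now key off btrfs_fs_info. A caller that
still holds only a root derives it once (sketch; hypothetical caller, assumes
ctree.h):

    static u32 example_node_fanout(struct btrfs_root *root)
    {
            struct btrfs_fs_info *fs_info = root->fs_info;

            /* key pointers that fit in one node, from the cached nodesize */
            return BTRFS_NODEPTRS_PER_BLOCK(fs_info);
    }
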
@@@ -2217,8 -2210,6 +2217,8 @@@ btrfs_disk_balance_args_to_cpu(struct b
        cpu->target = le64_to_cpu(disk->target);
        cpu->flags = le64_to_cpu(disk->flags);
        cpu->limit = le64_to_cpu(disk->limit);
 +      cpu->stripes_min = le32_to_cpu(disk->stripes_min);
 +      cpu->stripes_max = le32_to_cpu(disk->stripes_max);
  }
  
  static inline void
@@@ -2237,8 -2228,6 +2237,8 @@@ btrfs_cpu_balance_args_to_disk(struct b
        disk->target = cpu_to_le64(cpu->target);
        disk->flags = cpu_to_le64(cpu->flags);
        disk->limit = cpu_to_le64(cpu->limit);
 +      disk->stripes_min = cpu_to_le32(cpu->stripes_min);
 +      disk->stripes_max = cpu_to_le32(cpu->stripes_max);
  }
  
  /* struct btrfs_super_block */
@@@ -2310,13 -2299,13 +2310,13 @@@ static inline unsigned long btrfs_leaf_
   * this returns the address of the start of the last item,
   * which is the stop of the leaf data stack
   */
 -static inline unsigned int leaf_data_end(struct btrfs_root *root,
 +static inline unsigned int leaf_data_end(struct btrfs_fs_info *fs_info,
                                         struct extent_buffer *leaf)
  {
        u32 nr = btrfs_header_nritems(leaf);
  
        if (nr == 0)
 -              return BTRFS_LEAF_DATA_SIZE(root);
 +              return BTRFS_LEAF_DATA_SIZE(fs_info);
        return btrfs_item_offset_nr(leaf, nr - 1);
  }
  
@@@ -2512,6 -2501,11 +2512,6 @@@ BTRFS_SETGET_STACK_FUNCS(stack_dev_repl
  BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
                         struct btrfs_dev_replace_item, cursor_right, 64);
  
 -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 -{
 -      return sb->s_fs_info;
 -}
 -
  /* helper function to cast into the data area of the leaf. */
  #define btrfs_item_ptr(leaf, slot, type) \
        ((type *)(btrfs_leaf_data(leaf) + \
@@@ -2534,28 -2528,28 +2534,28 @@@ static inline gfp_t btrfs_alloc_write_m
  
  /* extent-tree.c */
  
 -u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
 +u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
  
 -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
                                                 unsigned num_items)
  {
 -      return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
 +      return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
  }
  
  /*
   * Doing a truncate won't result in new nodes or leaves, just what we need for
   * COW.
   */
 -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
                                                 unsigned num_items)
  {
 -      return root->nodesize * BTRFS_MAX_LEVEL * num_items;
 +      return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
  }
  
  int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                         const u64 start);
  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
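
The two reservation calculators keep the same worst-case math, only sourced
from fs_info->nodesize. For example, reserving for a two-item insert (sketch;
hypothetical caller):

    static u64 example_insert_reservation(struct btrfs_fs_info *fs_info)
    {
            /* two items, each possibly CoWing BTRFS_MAX_LEVEL nodes, twice over */
            return btrfs_calc_trans_metadata_size(fs_info, 2);
    }
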
@@@ -2564,18 -2558,18 +2564,18 @@@ void btrfs_dec_nocow_writers(struct btr
  void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root, unsigned long count);
 -int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 +                         struct btrfs_fs_info *fs_info, unsigned long count);
 +int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
                                 unsigned long count, u64 transid, int wait);
 -int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 -                           struct btrfs_root *root, u64 bytenr,
 +                           struct btrfs_fs_info *fs_info, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags);
 -int btrfs_pin_extent(struct btrfs_root *root,
 +int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
                     u64 bytenr, u64 num, int reserved);
 -int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 +int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
                                    u64 bytenr, u64 num_bytes);
 -int btrfs_exclude_logged_extents(struct btrfs_root *root,
 +int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
                                 struct extent_buffer *eb);
  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
@@@ -2596,11 -2590,12 +2596,11 @@@ void btrfs_free_tree_block(struct btrfs
                           struct extent_buffer *buf,
                           u64 parent, int last_ref);
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 -                                   struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins);
  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 -                                 struct btrfs_root *root,
 +                                 struct btrfs_fs_info *fs_info,
                                   u64 root_objectid, u64 owner, u64 offset,
                                   struct btrfs_key *ins);
  int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
@@@ -2611,52 -2606,52 +2611,52 @@@ int btrfs_inc_ref(struct btrfs_trans_ha
  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref);
  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 -                              struct btrfs_root *root,
 +                              struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int level, int is_data);
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
 -                    struct btrfs_root *root,
 +                    struct btrfs_fs_info *fs_info,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
                      u64 owner, u64 offset);
  
 -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
 -                             int delalloc);
 -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 +                             u64 start, u64 len, int delalloc);
 +int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
                                       u64 start, u64 len);
  void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 -                               struct btrfs_root *root);
 +                               struct btrfs_fs_info *fs_info);
  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 -                             struct btrfs_root *root);
 +                             struct btrfs_fs_info *fs_info);
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 -                       struct btrfs_root *root,
 +                       struct btrfs_fs_info *fs_info,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset);
  
  int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 -                                 struct btrfs_root *root);
 +                                 struct btrfs_fs_info *fs_info);
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 -                                  struct btrfs_root *root);
 +                                 struct btrfs_fs_info *fs_info);
  int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 -                          struct btrfs_root *root);
 -int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 +                          struct btrfs_fs_info *fs_info);
 +int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_free_block_groups(struct btrfs_fs_info *info);
 -int btrfs_read_block_groups(struct btrfs_root *root);
 -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
 +int btrfs_read_block_groups(struct btrfs_fs_info *info);
 +int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root, u64 bytes_used,
 +                         struct btrfs_fs_info *fs_info, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
                           u64 size);
  struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                                struct btrfs_fs_info *fs_info,
                                const u64 chunk_offset);
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 -                           struct btrfs_root *root, u64 group_start,
 +                           struct btrfs_fs_info *fs_info, u64 group_start,
                             struct extent_map *em);
  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
  void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
  void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_root *root);
 +                                     struct btrfs_fs_info *fs_info);
  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
  void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
  
@@@ -2686,7 -2681,7 +2686,7 @@@ void btrfs_free_reserved_data_space(str
  void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
                                            u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 -                              struct btrfs_root *root);
 +                                struct btrfs_fs_info *fs_info);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
  int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct inode *inode);
@@@ -2695,7 -2690,7 +2695,7 @@@ int btrfs_subvolume_reserve_metadata(st
                                     struct btrfs_block_rsv *rsv,
                                     int nitems,
                                     u64 *qgroup_reserved, bool use_global_rsv);
 -void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_rsv *rsv,
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
@@@ -2703,15 -2698,16 +2703,15 @@@ void btrfs_delalloc_release_metadata(st
  int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
  void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
                                              unsigned short type);
 -void btrfs_free_block_rsv(struct btrfs_root *root,
 +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
                          struct btrfs_block_rsv *rsv);
  void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
  int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush);
 -int btrfs_block_rsv_check(struct btrfs_root *root,
 -                        struct btrfs_block_rsv *block_rsv, int min_factor);
 +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
  int btrfs_block_rsv_refill(struct btrfs_root *root,
                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
                           enum btrfs_reserve_flush_enum flush);
@@@ -2721,21 -2717,22 +2721,21 @@@ int btrfs_block_rsv_migrate(struct btrf
  int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *dest, u64 num_bytes,
                             int min_factor);
 -void btrfs_block_rsv_release(struct btrfs_root *root,
 +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
  int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 -void btrfs_dec_block_group_ro(struct btrfs_root *root,
 -                            struct btrfs_block_group_cache *cache);
 +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
  u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 -int btrfs_error_unpin_extent_range(struct btrfs_root *root,
 +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
                                   u64 start, u64 end);
 -int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 num_bytes, u64 *actual_bytes);
  int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 -                          struct btrfs_root *root, u64 type);
 -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 +                          struct btrfs_fs_info *fs_info, u64 type);
 +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
  
  int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
@@@ -2745,7 -2742,8 +2745,7 @@@ int btrfs_start_write_no_snapshoting(st
  void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
  void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
  void check_system_chunk(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *root,
 -                      const u64 type);
 +                      struct btrfs_fs_info *fs_info, const u64 type);
  u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                       struct btrfs_fs_info *info, u64 start, u64 end);
  
@@@ -2795,10 -2793,10 +2795,10 @@@ int btrfs_copy_root(struct btrfs_trans_
                      struct extent_buffer **cow_ret, u64 new_root_objectid);
  int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf);
 -void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 +void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                       u32 data_size);
 -void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 -                       u32 new_size, int from_end);
 +void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
 +                       struct btrfs_path *path, u32 new_size, int from_end);
  int btrfs_split_item(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_path *path,
@@@ -2874,8 -2872,7 +2874,8 @@@ static inline int btrfs_next_item(struc
  {
        return btrfs_next_old_item(root, p, 0);
  }
 -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 +int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
 +                        struct extent_buffer *leaf);
  int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
                                     struct btrfs_block_rsv *block_rsv,
                                     int update_ref, int for_reloc);
@@@ -2901,9 -2898,10 +2901,9 @@@ static inline int btrfs_fs_closing(stru
   * anything except sleeping. This function is used to check the status of
   * the fs.
   */
 -static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
 +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
  {
 -      return (root->fs_info->sb->s_flags & MS_RDONLY ||
 -              btrfs_fs_closing(root->fs_info));
 +      return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info);
  }
  
  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
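
The cleaner-thread predicate likewise loses its detour through root->fs_info.
A sketch of the loop shape that consumes it (example_do_one_cleanup() is
hypothetical):

    static bool example_do_one_cleanup(struct btrfs_fs_info *fs_info);

    static void example_cleaner_loop(struct btrfs_fs_info *fs_info)
    {
            while (!btrfs_need_cleaner_sleep(fs_info)) {
                    if (!example_do_one_cleanup(fs_info))
                            break;
                    cond_resched();
            }
    }
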
@@@ -2933,11 -2931,11 +2933,11 @@@ int btrfs_old_root_level(struct btrfs_r
  
  /* root-item.c */
  int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 -                     struct btrfs_root *tree_root,
 +                     struct btrfs_fs_info *fs_info,
                       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
                       const char *name, int name_len);
  int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 -                     struct btrfs_root *tree_root,
 +                     struct btrfs_fs_info *fs_info,
                       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
                       const char *name, int name_len);
  int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@@ -2952,7 -2950,7 +2952,7 @@@ int __must_check btrfs_update_root(stru
  int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
                    struct btrfs_path *path, struct btrfs_root_item *root_item,
                    struct btrfs_key *root_key);
 -int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
  void btrfs_set_root_node(struct btrfs_root_item *item,
                         struct extent_buffer *node);
  void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
@@@ -2961,10 -2959,10 +2961,10 @@@ void btrfs_update_root_times(struct btr
  
  /* uuid-tree.c */
  int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *uuid_root, u8 *uuid, u8 type,
 +                      struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
                        u64 subid);
  int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *uuid_root, u8 *uuid, u8 type,
 +                      struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
                        u64 subid);
  int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
                            int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@@ -3006,10 -3004,10 +3006,10 @@@ struct btrfs_dir_item *btrfs_lookup_xat
                                          struct btrfs_path *path, u64 dir,
                                          const char *name, u16 name_len,
                                          int mod);
 -int verify_dir_item(struct btrfs_root *root,
 +int verify_dir_item(struct btrfs_fs_info *fs_info,
                    struct extent_buffer *leaf,
                    struct btrfs_dir_item *dir_item);
 -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_path *path,
                                                 const char *name,
                                                 int name_len);
@@@ -3053,10 -3051,11 +3053,10 @@@ int btrfs_find_name_in_ext_backref(stru
  /* file-item.c */
  struct btrfs_dio_private;
  int btrfs_del_csums(struct btrfs_trans_handle *trans,
 -                  struct btrfs_root *root, u64 bytenr, u64 len);
 -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 -                        struct bio *bio, u32 *dst);
 -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
 -                            struct bio *bio, u64 logical_offset);
 +                  struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
 +int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
 +int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
 +                            u64 logical_offset);
  int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 objectid, u64 pos,
@@@ -3070,8 -3069,8 +3070,8 @@@ int btrfs_lookup_file_extent(struct btr
  int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums);
 -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 -                     struct bio *bio, u64 file_start, int contig);
 +int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 +                     u64 file_start, int contig);
  int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list, int search_commit);
  void btrfs_extent_item_to_extent_map(struct inode *inode,
@@@ -3174,7 -3173,7 +3174,7 @@@ void btrfs_orphan_commit_root(struct bt
  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
  void btrfs_invalidate_inodes(struct btrfs_root *root);
  void btrfs_add_delayed_iput(struct inode *inode);
 -void btrfs_run_delayed_iputs(struct btrfs_root *root);
 +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
  int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
@@@ -3228,13 -3227,11 +3228,10 @@@ int btrfs_drop_extents(struct btrfs_tra
  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end);
  int btrfs_release_file(struct inode *inode, struct file *file);
 -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 -                    struct page **pages, size_t num_pages,
 -                    loff_t pos, size_t write_bytes,
 +int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 +                    size_t num_pages, loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
- ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-                             struct file *file_out, loff_t pos_out,
-                             size_t len, unsigned int flags);
  int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
                           struct file *file_out, loff_t pos_out, u64 len);
  
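
Dropping the btrfs_copy_file_range() declaration goes with the "fs: try to
clone files first in vfs_copy_file_range" commit from the shortlog: the
generic path now attempts ->clone_file_range before doing a data copy, so
btrfs only has to provide the clone hook. A hedged sketch of that ordering
(illustrative pseudologic, not the verbatim fs/read_write.c body):

    #include <linux/fs.h>
    #include <linux/splice.h>

    static ssize_t example_copy_file_range(struct file *file_in, loff_t pos_in,
                                           struct file *file_out, loff_t pos_out,
                                           size_t len)
    {
            /* reflink first: a same-superblock clone moves no data at all */
            if (file_in->f_op->clone_file_range &&
                file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
                    int err = file_in->f_op->clone_file_range(file_in, pos_in,
                                                              file_out, pos_out,
                                                              len);
                    if (err == 0)
                            return len;
                    /* on failure, fall through to an actual copy */
            }
            return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
    }
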
@@@ -3252,7 -3249,7 +3249,7 @@@ void btrfs_sysfs_remove_mounted(struct 
  ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
  
  /* super.c */
 -int btrfs_parse_options(struct btrfs_root *root, char *options,
 +int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                        unsigned long new_flags);
  int btrfs_sync_fs(struct super_block *sb, int wait);
  
@@@ -3445,14 -3442,9 +3442,14 @@@ do {                                                          
        /* Report first abort since mount */                    \
        if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
                        &((trans)->fs_info->fs_state))) {       \
 -              WARN(1, KERN_DEBUG                              \
 -              "BTRFS: Transaction aborted (error %d)\n",      \
 -              (errno));                                       \
 +              if ((errno) != -EIO) {                          \
 +                      WARN(1, KERN_DEBUG                              \
 +                      "BTRFS: Transaction aborted (error %d)\n",      \
 +                      (errno));                                       \
 +              } else {                                                \
 +                      pr_debug("BTRFS: Transaction aborted (error %d)\n", \
 +                                (errno));                     \
 +              }                                               \
        }                                                       \
        __btrfs_abort_transaction((trans), __func__,            \
                                  __LINE__, (errno));           \
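
At a call site, the -EIO special case means a transaction aborted by plain
media errors no longer triggers a full WARN splat. Sketch (hypothetical
wrapper; btrfs_del_csums() prototype as converted above):

    static int example_drop_csums(struct btrfs_trans_handle *trans,
                                  struct btrfs_fs_info *fs_info,
                                  u64 bytenr, u64 len)
    {
            int ret = btrfs_del_csums(trans, fs_info, bytenr, len);

            if (ret)        /* -EIO now logs at debug level; other errors still WARN */
                    btrfs_abort_transaction(trans, ret);
            return ret;
    }
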
@@@ -3614,7 -3606,7 +3611,7 @@@ static inline int btrfs_init_acl(struc
  #endif
  
  /* relocation.c */
 -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
 +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
  int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
  int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
@@@ -3633,12 -3625,12 +3630,12 @@@ int btrfs_reloc_post_snapshot(struct bt
  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace);
 -void btrfs_scrub_pause(struct btrfs_root *root);
 -void btrfs_scrub_continue(struct btrfs_root *root);
 +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
 +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
  int btrfs_scrub_cancel(struct btrfs_fs_info *info);
  int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                           struct btrfs_device *dev);
 -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress);
  
  /* dev-replace.c */
@@@ -3653,7 -3645,7 +3650,7 @@@ static inline void btrfs_bio_counter_de
  
  /* reada.c */
  struct reada_control {
 -      struct btrfs_root       *root;          /* tree to prefetch */
 +      struct btrfs_fs_info    *fs_info;               /* tree to prefetch */
        struct btrfs_key        key_start;
        struct btrfs_key        key_end;        /* exclusive */
        atomic_t                elems;
@@@ -3665,7 -3657,7 +3662,7 @@@ struct reada_control *btrfs_reada_add(s
  int btrfs_reada_wait(void *handle);
  void btrfs_reada_detach(void *handle);
  int btree_readahead_hook(struct btrfs_fs_info *fs_info,
 -                       struct extent_buffer *eb, u64 start, int err);
 +                       struct extent_buffer *eb, int err);
  
  static inline int is_fstree(u64 rootid)
  {
diff --combined fs/btrfs/file.c
@@@ -27,6 -27,7 +27,6 @@@
  #include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
 -#include <linux/statfs.h>
  #include <linux/compat.h>
  #include <linux/slab.h>
  #include <linux/btrfs.h>
@@@ -95,13 -96,13 +95,13 @@@ static int __compare_inode_defrag(struc
  static int __btrfs_add_inode_defrag(struct inode *inode,
                                    struct inode_defrag *defrag)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct inode_defrag *entry;
        struct rb_node **p;
        struct rb_node *parent = NULL;
        int ret;
  
 -      p = &root->fs_info->defrag_inodes.rb_node;
 +      p = &fs_info->defrag_inodes.rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
        }
        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
 -      rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
 +      rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
        return 0;
  }
  
 -static inline int __need_auto_defrag(struct btrfs_root *root)
 +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
  {
 -      if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
 +      if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
                return 0;
  
 -      if (btrfs_fs_closing(root->fs_info))
 +      if (btrfs_fs_closing(fs_info))
                return 0;
  
        return 1;
  int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct inode *inode)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct inode_defrag *defrag;
        u64 transid;
        int ret;
  
 -      if (!__need_auto_defrag(root))
 +      if (!__need_auto_defrag(fs_info))
                return 0;
  
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
        defrag->transid = transid;
        defrag->root = root->root_key.objectid;
  
 -      spin_lock(&root->fs_info->defrag_inodes_lock);
 +      spin_lock(&fs_info->defrag_inodes_lock);
        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
                /*
                 * If we set IN_DEFRAG flag and evict the inode from memory,
        } else {
                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
        }
 -      spin_unlock(&root->fs_info->defrag_inodes_lock);
 +      spin_unlock(&fs_info->defrag_inodes_lock);
        return 0;
  }
  
  static void btrfs_requeue_inode_defrag(struct inode *inode,
                                       struct inode_defrag *defrag)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int ret;
  
 -      if (!__need_auto_defrag(root))
 +      if (!__need_auto_defrag(fs_info))
                goto out;
  
        /*
         * Here we don't check the IN_DEFRAG flag, because we need merge
         * them together.
         */
 -      spin_lock(&root->fs_info->defrag_inodes_lock);
 +      spin_lock(&fs_info->defrag_inodes_lock);
        ret = __btrfs_add_inode_defrag(inode, defrag);
 -      spin_unlock(&root->fs_info->defrag_inodes_lock);
 +      spin_unlock(&fs_info->defrag_inodes_lock);
        if (ret)
                goto out;
        return;
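
All three defrag paths above now fetch the fs_info from the superblock once;
the gating test they share is a one-liner (sketch; mirrors the converted
__need_auto_defrag() above):

    static int example_should_defrag(struct inode *inode)
    {
            struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

            return btrfs_test_opt(fs_info, AUTO_DEFRAG) &&
                   !btrfs_fs_closing(fs_info);
    }
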
@@@ -373,7 -373,7 +373,7 @@@ int btrfs_run_defrag_inodes(struct btrf
                             &fs_info->fs_state))
                        break;
  
 -              if (!__need_auto_defrag(fs_info->tree_root))
 +              if (!__need_auto_defrag(fs_info))
                        break;
  
                /* find an inode to defrag */
@@@ -485,11 -485,11 +485,11 @@@ static void btrfs_drop_pages(struct pag
   * this also makes the decision about creating an inline extent vs
   * doing real data extents, marking pages dirty and delalloc as required.
   */
 -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 -                           struct page **pages, size_t num_pages,
 -                           loff_t pos, size_t write_bytes,
 -                           struct extent_state **cached)
 +int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 +                    size_t num_pages, loff_t pos, size_t write_bytes,
 +                    struct extent_state **cached)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int err = 0;
        int i;
        u64 num_bytes;
        u64 end_pos = pos + write_bytes;
        loff_t isize = i_size_read(inode);
  
 -      start_pos = pos & ~((u64)root->sectorsize - 1);
 -      num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
 +      start_pos = pos & ~((u64) fs_info->sectorsize - 1);
 +      num_bytes = round_up(write_bytes + pos - start_pos,
 +                           fs_info->sectorsize);
  
        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@@ -697,7 -696,6 +697,7 @@@ int __btrfs_drop_extents(struct btrfs_t
                         u32 extent_item_size,
                         int *key_inserted)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 num_bytes = 0;
        u64 extent_offset = 0;
        u64 extent_end = 0;
 +      u64 last_end = start;
        int del_nr = 0;
        int del_slot = 0;
        int extent_type;
                modify_tree = 0;
  
        update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 -                     root == root->fs_info->tree_root);
 +                     root == fs_info->tree_root);
        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@@ -800,10 -797,8 +800,10 @@@ next_slot
                 * extent item in the call to setup_items_for_insert() later
                 * in this function.
                 */
 -              if (extent_end == key.offset && extent_end >= search_start)
 +              if (extent_end == key.offset && extent_end >= search_start) {
 +                      last_end = extent_end;
                        goto delete_extent_item;
 +              }
  
                if (extent_end <= search_start) {
                        path->slots[0]++;
                        btrfs_mark_buffer_dirty(leaf);
  
                        if (update_refs && disk_bytenr > 0) {
 -                              ret = btrfs_inc_extent_ref(trans, root,
 +                              ret = btrfs_inc_extent_ref(trans, fs_info,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
                        }
                        key.offset = start;
                }
 +              /*
 +               * From here on out we will have actually dropped something, so
 +               * last_end can be updated.
 +               */
 +              last_end = extent_end;
 +
                /*
                 *  | ---- range to drop ----- |
                 *      | -------- extent -------- |
  
                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = end;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
  
                        extent_offset += end - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@@ -938,9 -927,9 +938,9 @@@ delete_extent_item
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                extent_end = ALIGN(extent_end,
 -                                                 root->sectorsize);
 +                                                 fs_info->sectorsize);
                        } else if (update_refs && disk_bytenr > 0) {
 -                              ret = btrfs_free_extent(trans, root,
 +                              ret = btrfs_free_extent(trans, fs_info,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
        if (!ret && replace_extent && leafs_visited == 1 &&
            (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
             path->locks[0] == BTRFS_WRITE_LOCK) &&
 -          btrfs_leaf_free_space(root, leaf) >=
 +          btrfs_leaf_free_space(fs_info, leaf) >=
            sizeof(struct btrfs_item) + extent_item_size) {
  
                key.objectid = ino;
        if (!replace_extent || !(*key_inserted))
                btrfs_release_path(path);
        if (drop_end)
 -              *drop_end = found ? min(end, extent_end) : end;
 +              *drop_end = found ? min(end, last_end) : end;
        return ret;
  }
  
@@@ -1084,7 -1073,6 +1084,7 @@@ static int extent_mergeable(struct exte
  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
@@@ -1154,7 -1142,7 +1154,7 @@@ again
                                     ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_generation(leaf, fi,
                                                         trans->transid);
                        path->slots[0]++;
                        new_key.offset = start;
 -                      btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 +                      btrfs_set_item_key_safe(fs_info, path, &new_key);
  
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                                                extent_end - split);
                btrfs_mark_buffer_dirty(leaf);
  
 -              ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 -                                         root->root_key.objectid,
 +              ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes,
 +                                         0, root->root_key.objectid,
                                           ino, orig_offset);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                extent_end = other_end;
                del_slot = path->slots[0] + 1;
                del_nr++;
 -              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 +              ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        ino, orig_offset);
                if (ret) {
                key.offset = other_start;
                del_slot = path->slots[0];
                del_nr++;
 -              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 +              ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        ino, orig_offset);
                if (ret) {
@@@ -1421,16 -1409,15 +1421,16 @@@ lock_and_cleanup_extent_if_need(struct 
                                u64 *lockstart, u64 *lockend,
                                struct extent_state **cached_state)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 start_pos;
        u64 last_pos;
        int i;
        int ret = 0;
  
 -      start_pos = round_down(pos, root->sectorsize);
 +      start_pos = round_down(pos, fs_info->sectorsize);
        last_pos = start_pos
 -              + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
 +              + round_up(pos + write_bytes - start_pos,
 +                         fs_info->sectorsize) - 1;
  
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
  static noinline int check_can_nocow(struct inode *inode, loff_t pos,
                                    size_t *write_bytes)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered;
        u64 lockstart, lockend;
        if (!ret)
                return -ENOSPC;
  
 -      lockstart = round_down(pos, root->sectorsize);
 -      lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
 +      lockstart = round_down(pos, fs_info->sectorsize);
 +      lockend = round_up(pos + *write_bytes,
 +                         fs_info->sectorsize) - 1;
  
        while (1) {
                lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@@ -1524,7 -1509,6 +1524,7 @@@ static noinline ssize_t __btrfs_buffere
                                               loff_t pos)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
        struct extent_state *cached_state = NULL;
                        break;
                }
  
 -              sector_offset = pos & (root->sectorsize - 1);
 +              sector_offset = pos & (fs_info->sectorsize - 1);
                reserve_bytes = round_up(write_bytes + sector_offset,
 -                              root->sectorsize);
 +                              fs_info->sectorsize);
  
                ret = btrfs_check_data_free_space(inode, pos, write_bytes);
                if (ret < 0) {
                                                         PAGE_SIZE);
                                reserve_bytes = round_up(write_bytes +
                                                         sector_offset,
 -                                                       root->sectorsize);
 +                                                       fs_info->sectorsize);
                        } else {
                                break;
                        }
@@@ -1637,10 -1621,12 +1637,10 @@@ again
  
                copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
  
 -              num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
 -                                              reserve_bytes);
 +              num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
                dirty_sectors = round_up(copied + sector_offset,
 -                                      root->sectorsize);
 -              dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
 -                                              dirty_sectors);
 +                                      fs_info->sectorsize);
 +              dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
  
                /*
                 * if we have trouble faulting in the pages, fall
                 * managed to copy.
                 */
                if (num_sectors > dirty_sectors) {
 -
                        /* release everything except the sectors we dirtied */
                        release_bytes -= dirty_sectors <<
 -                              root->fs_info->sb->s_blocksize_bits;
 -
 +                                              fs_info->sb->s_blocksize_bits;
                        if (copied > 0) {
                                spin_lock(&BTRFS_I(inode)->lock);
                                BTRFS_I(inode)->outstanding_extents++;
                        } else {
                                u64 __pos;
  
 -                              __pos = round_down(pos, root->sectorsize) +
 +                              __pos = round_down(pos,
 +                                                 fs_info->sectorsize) +
                                        (dirty_pages << PAGE_SHIFT);
                                btrfs_delalloc_release_space(inode, __pos,
                                                             release_bytes);
                }
  
                release_bytes = round_up(copied + sector_offset,
 -                                      root->sectorsize);
 +                                      fs_info->sectorsize);
  
                if (copied > 0)
 -                      ret = btrfs_dirty_pages(root, inode, pages,
 -                                              dirty_pages, pos, copied,
 -                                              NULL);
 +                      ret = btrfs_dirty_pages(inode, pages, dirty_pages,
 +                                              pos, copied, NULL);
                if (need_unlock)
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             lockstart, lockend, &cached_state,
                        btrfs_end_write_no_snapshoting(root);
  
                if (only_release_metadata && copied > 0) {
 -                      lockstart = round_down(pos, root->sectorsize);
 -                      lockend = round_up(pos + copied, root->sectorsize) - 1;
 +                      lockstart = round_down(pos,
 +                                             fs_info->sectorsize);
 +                      lockend = round_up(pos + copied,
 +                                         fs_info->sectorsize) - 1;
  
                        set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
                                       lockend, EXTENT_NORESERVE, NULL,
                cond_resched();
  
                balance_dirty_pages_ratelimited(inode->i_mapping);
 -              if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
 -                      btrfs_btree_balance_dirty(root);
 +              if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
 +                      btrfs_btree_balance_dirty(fs_info);
  
                pos += copied;
                num_written += copied;
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
                        btrfs_delalloc_release_space(inode,
 -                                              round_down(pos, root->sectorsize),
 +                                              round_down(pos, fs_info->sectorsize),
                                                release_bytes);
                }
        }
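
The sector bookkeeping threaded through __btrfs_buffered_write() reduces to a
couple of round/shift operations on the cached sectorsize; isolated here for
clarity (sketch; assumes ctree.h):

    static u64 example_dirty_sectors(struct btrfs_fs_info *fs_info,
                                     loff_t pos, size_t copied)
    {
            u64 sector_offset = pos & (fs_info->sectorsize - 1);

            /* bytes actually touched, rounded out to whole sectors */
            return BTRFS_BYTES_TO_BLKS(fs_info,
                            round_up(copied + sector_offset,
                                     fs_info->sectorsize));
    }
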
@@@ -1812,7 -1798,6 +1812,7 @@@ static ssize_t btrfs_file_write_iter(st
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start_pos;
        u64 end_pos;
         * although we have opened a file as writable, we have
         * to stop this write operation to ensure FS consistency.
         */
 -      if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
 +      if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                inode_unlock(inode);
                err = -EROFS;
                goto out;
  
        pos = iocb->ki_pos;
        count = iov_iter_count(from);
 -      start_pos = round_down(pos, root->sectorsize);
 +      start_pos = round_down(pos, fs_info->sectorsize);
        oldsize = i_size_read(inode);
        if (start_pos > oldsize) {
                /* Expand hole size to cover write data, preventing empty gap */
 -              end_pos = round_up(pos + count, root->sectorsize);
 +              end_pos = round_up(pos + count,
 +                                 fs_info->sectorsize);
                err = btrfs_cont_expand(inode, oldsize, end_pos);
                if (err) {
                        inode_unlock(inode);
                        goto out;
                }
 -              if (start_pos > round_up(oldsize, root->sectorsize))
 +              if (start_pos > round_up(oldsize, fs_info->sectorsize))
                        clean_page = 1;
        }
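
The extending-write path just above rounds the write window outward to sector
boundaries before expanding the hole with btrfs_cont_expand(). The arithmetic
as a standalone sketch; the rounding macros are redefined here (matching the
usual power-of-two kernel definitions) so the example runs in user space:

    #include <stdio.h>

    #define round_down(x, y) ((x) & ~((__typeof__(x))(y) - 1))
    #define round_up(x, y)   ((((x) - 1) | ((__typeof__(x))(y) - 1)) + 1)

    int main(void)
    {
            unsigned long long sectorsize = 4096, pos = 6000, count = 3000;

            /* a write of 3000 bytes at offset 6000 on a 4K-sector fs */
            printf("start_pos = %llu\n", round_down(pos, sectorsize));       /* 4096  */
            printf("end_pos   = %llu\n", round_up(pos + count, sectorsize)); /* 12288 */
            return 0;
    }
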
  
@@@ -1951,7 -1935,6 +1951,7 @@@ int btrfs_sync_file(struct file *file, 
  {
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = d_inode(dentry);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct btrfs_log_ctx ctx;
         * commit does not start nor wait for ordered extents to complete.
         */
        smp_mb();
 -      if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
 +      if (btrfs_inode_in_log(inode, fs_info->generation) ||
            (full_sync && BTRFS_I(inode)->last_trans <=
 -           root->fs_info->last_trans_committed) ||
 +           fs_info->last_trans_committed) ||
            (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
             BTRFS_I(inode)->last_trans
 -           <= root->fs_info->last_trans_committed)) {
 +           <= fs_info->last_trans_committed)) {
                /*
                 * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
         * which are indicated by ctx.io_err.
         */
        if (ctx.io_err) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                ret = ctx.io_err;
                goto out;
        }
                if (!ret) {
                        ret = btrfs_sync_log(trans, root, &ctx);
                        if (!ret) {
 -                              ret = btrfs_end_transaction(trans, root);
 +                              ret = btrfs_end_transaction(trans);
                                goto out;
                        }
                }
                if (!full_sync) {
                        ret = btrfs_wait_ordered_range(inode, start, len);
                        if (ret) {
 -                              btrfs_end_transaction(trans, root);
 +                              btrfs_end_transaction(trans);
                                goto out;
                        }
                }
 -              ret = btrfs_commit_transaction(trans, root);
 +              ret = btrfs_commit_transaction(trans);
        } else {
 -              ret = btrfs_end_transaction(trans, root);
 +              ret = btrfs_end_transaction(trans);
        }
  out:
        return ret > 0 ? -EIO : ret;
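
Every btrfs_end_transaction() and btrfs_commit_transaction() call in this hunk
loses its btrfs_root argument. The enabling change is only visible indirectly
here: the transaction handle itself knows which filesystem it belongs to, so
the extra parameter carried no information. A schematic stub, not the kernel
definition:

    struct btrfs_fs_info;

    struct btrfs_trans_handle {
            struct btrfs_fs_info *fs_info;  /* recorded when the handle is started */
            /* ... */
    };

    /* was: int btrfs_end_transaction(struct btrfs_trans_handle *trans,
     *                                struct btrfs_root *root);            */
    int btrfs_end_transaction(struct btrfs_trans_handle *trans);
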
@@@ -2225,7 -2208,6 +2225,7 @@@ static int hole_mergeable(struct inode 
  static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
                      struct btrfs_path *path, u64 offset, u64 end)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        int ret;
  
 -      if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
 +      if (btrfs_fs_incompat(fs_info, NO_HOLES))
                goto out;
  
        key.objectid = btrfs_ino(inode);
        key.offset = offset;
  
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 -      if (ret < 0)
 +      if (ret <= 0) {
 +              /*
 +               * We should have dropped this offset, so if we find it then
 +               * something has gone horribly wrong.
 +               */
 +              if (ret == 0)
 +                      ret = -EINVAL;
                return ret;
 -      BUG_ON(!ret);
 +      }
  
        leaf = path->nodes[0];
        if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
                u64 num_bytes;
  
                key.offset = offset;
 -              btrfs_set_item_key_safe(root->fs_info, path, &key);
 +              btrfs_set_item_key_safe(fs_info, path, &key);
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
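
fill_holes() above replaces BUG_ON(!ret) with an error return. The reasoning
is in the new comment: an exact key match means the offset that should have
been dropped still exists, so the tree is already inconsistent, and returning
-EINVAL beats crashing the box. The shape of the hardening as a runnable
miniature; search() is a stand-in for btrfs_search_slot():

    #include <errno.h>
    #include <stdio.h>

    static int search(void) { return 1; }  /* >0: no exact match, the expected case */

    static int fill_holes_like(void)
    {
            int ret = search();
            if (ret <= 0) {
                    if (ret == 0)           /* "impossible" exact match */
                            ret = -EINVAL;  /* fail the operation, don't BUG */
                    return ret;
            }
            /* ... insert or merge the hole item ... */
            return 0;
    }

    int main(void)
    {
            printf("ret=%d\n", fill_holes_like());  /* ret=0 */
            return 0;
    }
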
@@@ -2308,7 -2284,7 +2308,7 @@@ out
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
                hole_em->orig_block_len = 0;
 -              hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 +              hole_em->bdev = fs_info->fs_devices->latest_bdev;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
  
@@@ -2360,7 -2336,6 +2360,7 @@@ static int find_first_non_hole(struct i
  
  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        u64 tail_len;
        u64 orig_start = offset;
        u64 cur_offset;
 -      u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 +      u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
        u64 drop_end;
        int ret = 0;
        int err = 0;
        unsigned int rsv_count;
        bool same_block;
 -      bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 +      bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
        u64 ino_size;
        bool truncated_block = false;
        bool updated_inode = false;
                return ret;
  
        inode_lock(inode);
 -      ino_size = round_up(inode->i_size, root->sectorsize);
 +      ino_size = round_up(inode->i_size, fs_info->sectorsize);
        ret = find_first_non_hole(inode, &offset, &len);
        if (ret < 0)
                goto out_only_mutex;
                goto out_only_mutex;
        }
  
 -      lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 +      lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
        lockend = round_down(offset + len,
 -                           BTRFS_I(inode)->root->sectorsize) - 1;
 -      same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
 -              == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
 +                           btrfs_inode_sectorsize(inode)) - 1;
 +      same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
 +              == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
        /*
         * We needn't truncate any block which is beyond the end of the file
         * because we are sure there is no data there.
         * Only do this if we are in the same block and we aren't doing the
         * entire block.
         */
 -      if (same_block && len < root->sectorsize) {
 +      if (same_block && len < fs_info->sectorsize) {
                if (offset < ino_size) {
                        truncated_block = true;
                        ret = btrfs_truncate_block(inode, offset, len, 0);
                goto out;
        }
  
 -      rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 +      rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                ret = -ENOMEM;
                goto out_free;
        }
 -      rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
 +      rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
        rsv->failfast = 1;
  
        /*
                goto out_free;
        }
  
 -      ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
 +      ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
                                      min_size, 0);
        BUG_ON(ret);
        trans->block_rsv = rsv;
                if (ret != -ENOSPC)
                        break;
  
 -              trans->block_rsv = &root->fs_info->trans_block_rsv;
 +              trans->block_rsv = &fs_info->trans_block_rsv;
  
 -              if (cur_offset < ino_size) {
 +              if (cur_offset < drop_end && cur_offset < ino_size) {
                        ret = fill_holes(trans, inode, path, cur_offset,
                                         drop_end);
                        if (ret) {
 +                              /*
 +                               * If we failed then we didn't insert our hole
 +                               * entries for the area we dropped, so now the
 +                               * fs is corrupted, so we must abort the
 +                               * transaction.
 +                               */
 +                              btrfs_abort_transaction(trans, ret);
                                err = ret;
                                break;
                        }
                        break;
                }
  
 -              btrfs_end_transaction(trans, root);
 -              btrfs_btree_balance_dirty(root);
 +              btrfs_end_transaction(trans);
 +              btrfs_btree_balance_dirty(fs_info);
  
                trans = btrfs_start_transaction(root, rsv_count);
                if (IS_ERR(trans)) {
                        break;
                }
  
 -              ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
 +              ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
                                              rsv, min_size, 0);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
                goto out_trans;
        }
  
 -      trans->block_rsv = &root->fs_info->trans_block_rsv;
 +      trans->block_rsv = &fs_info->trans_block_rsv;
        /*
         * If we are using the NO_HOLES feature we might already have had a
         * hole that overlaps a part of the region [lockstart, lockend] and
        if (cur_offset < ino_size && cur_offset < drop_end) {
                ret = fill_holes(trans, inode, path, cur_offset, drop_end);
                if (ret) {
 +                      /* Same comment as above. */
 +                      btrfs_abort_transaction(trans, ret);
                        err = ret;
                        goto out_trans;
                }
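
Both fill_holes() failure paths in btrfs_punch_hole() now abort the
transaction first, for the reason the new comment spells out: the extents are
already dropped, so a missing hole item leaves the tree inconsistent, and
btrfs_abort_transaction() (which forces the filesystem read-only) is the only
safe reaction before unwinding. The pattern in isolation, kernel-style, with
everything except the two btrfs_* calls elided:

    ret = fill_holes(trans, inode, path, cur_offset, drop_end);
    if (ret) {
            /* tree modified but hole item missing: poison the
             * transaction before taking the normal error exit */
            btrfs_abort_transaction(trans, ret);
            err = ret;
            break;          /* or goto out_trans, as in the tail copy */
    }
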
@@@ -2639,14 -2605,14 +2639,14 @@@ out_trans
        inode_inc_iversion(inode);
        inode->i_mtime = inode->i_ctime = current_time(inode);
  
 -      trans->block_rsv = &root->fs_info->trans_block_rsv;
 +      trans->block_rsv = &fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
        updated_inode = true;
 -      btrfs_end_transaction(trans, root);
 -      btrfs_btree_balance_dirty(root);
 +      btrfs_end_transaction(trans);
 +      btrfs_btree_balance_dirty(fs_info);
  out_free:
        btrfs_free_path(path);
 -      btrfs_free_block_rsv(root, rsv);
 +      btrfs_free_block_rsv(fs_info, rsv);
  out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
@@@ -2664,7 -2630,7 +2664,7 @@@ out_only_mutex
                        err = PTR_ERR(trans);
                } else {
                        err = btrfs_update_inode(trans, root, inode);
 -                      ret = btrfs_end_transaction(trans, root);
 +                      ret = btrfs_end_transaction(trans);
                }
        }
        inode_unlock(inode);
@@@ -2729,7 -2695,7 +2729,7 @@@ static long btrfs_fallocate(struct fil
        u64 locked_end;
        u64 actual_end = 0;
        struct extent_map *em;
 -      int blocksize = BTRFS_I(inode)->root->sectorsize;
 +      int blocksize = btrfs_inode_sectorsize(inode);
        int ret;
  
        alloc_start = round_down(offset, blocksize);
                        btrfs_ordered_update_i_size(inode, actual_end, NULL);
                        ret = btrfs_update_inode(trans, root, inode);
                        if (ret)
 -                              btrfs_end_transaction(trans, root);
 +                              btrfs_end_transaction(trans);
                        else
 -                              ret = btrfs_end_transaction(trans, root);
 +                              ret = btrfs_end_transaction(trans);
                }
        }
  out_unlock:
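
The fallocate tail above is a small error-precedence idiom: if an earlier step
already failed, btrfs_end_transaction()'s own return value is deliberately
discarded so the first error is reported; only on the success path can the
commit's result become the return code. A runnable miniature with invented
errno values:

    #include <stdio.h>

    static int end_trans(void) { return -5; }  /* pretend the commit fails: -EIO */

    int main(void)
    {
            int ret = -28;                  /* earlier failure: -ENOSPC */

            if (ret)
                    (void)end_trans();      /* don't clobber the first error */
            else
                    ret = end_trans();      /* success path: surface commit errors */

            printf("ret=%d\n", ret);        /* ret=-28 */
            return 0;
    }
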
@@@ -2925,7 -2891,7 +2925,7 @@@ out
  
  static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
  {
 -      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 lockstart;
         */
        start = max_t(loff_t, 0, *offset);
  
 -      lockstart = round_down(start, root->sectorsize);
 -      lockend = round_up(i_size_read(inode), root->sectorsize);
 +      lockstart = round_down(start, fs_info->sectorsize);
 +      lockend = round_up(i_size_read(inode),
 +                         fs_info->sectorsize);
        if (lockend <= lockstart)
 -              lockend = lockstart + root->sectorsize;
 +              lockend = lockstart + fs_info->sectorsize;
        lockend--;
        len = lockend - lockstart + 1;
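
find_desired_extent() clamps its lock range so it can never be empty. Worked
numbers, assuming sectorsize 4096, start = 8192, i_size = 100:

    lockstart = round_down(8192, 4096) = 8192
    lockend   = round_up(100, 4096)    = 4096    (<= lockstart, so clamp)
    lockend   = 8192 + 4096            = 12288
    lockend-- = 12287;  len = 12287 - 8192 + 1 = 4096

That is one full sector starting at lockstart, even when the rounded i_size
falls at or below the search start.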
  
@@@ -3033,7 -2998,6 +3033,6 @@@ const struct file_operations btrfs_file
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
  #endif
-       .copy_file_range = btrfs_copy_file_range,
        .clone_file_range = btrfs_clone_file_range,
        .dedupe_file_range = btrfs_dedupe_file_range,
  };
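
Note the removal of .copy_file_range from btrfs_file_operations: the
btrfs_copy_file_range() wrapper it pointed at, deleted from ioctl.c further
down in this diff, did nothing but call btrfs_clone_files() and report the
whole requested length as copied on success. Its semantics in miniature,
where clone() is a stand-in for that call:

    ssize_t ret = clone(file_out, file_in, pos_in, pos_out, len);
    return ret == 0 ? (ssize_t)len : ret;

.clone_file_range and .dedupe_file_range stay, so clone-capable copies remain
reachable through the remaining hooks.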
diff --combined fs/btrfs/ioctl.c
@@@ -33,6 -33,7 +33,6 @@@
  #include <linux/namei.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
 -#include <linux/statfs.h>
  #include <linux/compat.h>
  #include <linux/bit_spinlock.h>
  #include <linux/security.h>
@@@ -215,7 -216,6 +215,7 @@@ static int check_flags(unsigned int fla
  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_inode *ip = BTRFS_I(inode);
        struct btrfs_root *root = ip->root;
        struct btrfs_trans_handle *trans;
                ip->flags |= BTRFS_INODE_COMPRESS;
                ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
  
 -              if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
 +              if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
                        comp = "lzo";
                else
                        comp = "zlib";
        inode->i_ctime = current_time(inode);
        ret = btrfs_update_inode(trans, root, inode);
  
 -      btrfs_end_transaction(trans, root);
 +      btrfs_end_transaction(trans);
   out_drop:
        if (ret) {
                ip->flags = ip_oldflags;
@@@ -374,8 -374,7 +374,8 @@@ static int btrfs_ioctl_getversion(struc
  
  static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
  {
 -      struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
  
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
 -      ret = btrfs_trim_fs(fs_info->tree_root, &range);
 +      ret = btrfs_trim_fs(fs_info, &range);
        if (ret < 0)
                return ret;
  
@@@ -438,7 -437,6 +438,7 @@@ static noinline int create_subvol(struc
                                  u64 *async_transid,
                                  struct btrfs_qgroup_inherit *inherit)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_root_item *root_item;
        if (!root_item)
                return -ENOMEM;
  
 -      ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
 +      ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
                goto fail_free;
  
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 -              btrfs_subvolume_release_metadata(root, &block_rsv,
 +              btrfs_subvolume_release_metadata(fs_info, &block_rsv,
                                                 qgroup_reserved);
                goto fail_free;
        }
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
  
 -      ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
 +      ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit);
        if (ret)
                goto fail;
  
                goto fail;
        }
  
 -      memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
 +      memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(leaf, leaf->start);
        btrfs_set_header_generation(leaf, trans->transid);
        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(leaf, objectid);
  
 -      write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
 -                          BTRFS_FSID_SIZE);
 -      write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
 -                          btrfs_header_chunk_tree_uuid(leaf),
 -                          BTRFS_UUID_SIZE);
 +      write_extent_buffer_fsid(leaf, fs_info->fsid);
 +      write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
        btrfs_mark_buffer_dirty(leaf);
  
        inode_item = &root_item->inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
 -      btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
 +      btrfs_set_stack_inode_nbytes(inode_item,
 +                                   fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
  
        btrfs_set_root_flags(root_item, 0);
        key.objectid = objectid;
        key.offset = 0;
        key.type = BTRFS_ROOT_ITEM_KEY;
 -      ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 +      ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
                                root_item);
        if (ret)
                goto fail;
  
        key.offset = (u64)-1;
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
  
 -      ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
 +      ret = btrfs_add_root_ref(trans, fs_info,
                                 objectid, root->root_key.objectid,
                                 btrfs_ino(dir), index, name, namelen);
        BUG_ON(ret);
  
 -      ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
 -                                root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
 -                                objectid);
 +      ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
 +                                BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret)
                btrfs_abort_transaction(trans, ret);
  
@@@ -613,15 -614,15 +613,15 @@@ fail
        kfree(root_item);
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
 -      btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 +      btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
  
        if (async_transid) {
                *async_transid = trans->transid;
 -              err = btrfs_commit_transaction_async(trans, root, 1);
 +              err = btrfs_commit_transaction_async(trans, 1);
                if (err)
 -                      err = btrfs_commit_transaction(trans, root);
 +                      err = btrfs_commit_transaction(trans);
        } else {
 -              err = btrfs_commit_transaction(trans, root);
 +              err = btrfs_commit_transaction(trans);
        }
        if (err && !ret)
                ret = err;
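
create_subvol()'s commit tail keeps the async-commit fallback idiom across the
signature change: btrfs_commit_transaction_async() can fail to start the
background commit, in which case the caller just commits synchronously on the
spot. A runnable miniature; the stub return values are invented:

    #include <stdio.h>

    static int commit_async(void) { return -12; }  /* say -ENOMEM: could not queue */
    static int commit_sync(void)  { return 0; }

    int main(void)
    {
            int err = commit_async();
            if (err)                        /* async path only failed to start, */
                    err = commit_sync();    /* so fall back to an inline commit */
            printf("err=%d\n", err);        /* err=0 */
            return 0;
    }
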
@@@ -661,7 -662,6 +661,7 @@@ static int create_snapshot(struct btrfs
                           u64 *async_transid, bool readonly,
                           struct btrfs_qgroup_inherit *inherit)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        struct btrfs_trans_handle *trans;
                goto fail;
        }
  
 -      spin_lock(&root->fs_info->trans_lock);
 +      spin_lock(&fs_info->trans_lock);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
 -      spin_unlock(&root->fs_info->trans_lock);
 +      spin_unlock(&fs_info->trans_lock);
        if (async_transid) {
                *async_transid = trans->transid;
 -              ret = btrfs_commit_transaction_async(trans,
 -                                   root->fs_info->extent_root, 1);
 +              ret = btrfs_commit_transaction_async(trans, 1);
                if (ret)
 -                      ret = btrfs_commit_transaction(trans, root);
 +                      ret = btrfs_commit_transaction(trans);
        } else {
 -              ret = btrfs_commit_transaction(trans,
 -                                             root->fs_info->extent_root);
 +              ret = btrfs_commit_transaction(trans);
        }
        if (ret)
                goto fail;
        d_instantiate(dentry, inode);
        ret = 0;
  fail:
 -      btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
 +      btrfs_subvolume_release_metadata(fs_info,
                                         &pending_snapshot->block_rsv,
                                         pending_snapshot->qgroup_reserved);
  dec_and_free:
@@@ -834,14 -836,13 +834,14 @@@ static inline int btrfs_may_create(stru
   * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
   * inside this filesystem so it's quite a bit simpler.
   */
- static noinline int btrfs_mksubvol(struct path *parent,
+ static noinline int btrfs_mksubvol(const struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
                                   u64 *async_transid, bool readonly,
                                   struct btrfs_qgroup_inherit *inherit)
  {
 -      struct inode *dir  = d_inode(parent->dentry);
 +      struct inode *dir = d_inode(parent->dentry);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct dentry *dentry;
        int error;
  
        if (error)
                goto out_dput;
  
 -      down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 +      down_read(&fs_info->subvol_sem);
  
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
                goto out_up_read;
        if (!error)
                fsnotify_mkdir(dir, dentry);
  out_up_read:
 -      up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 +      up_read(&fs_info->subvol_sem);
  out_dput:
        dput(dentry);
  out_unlock:
@@@ -1267,7 -1268,6 +1267,7 @@@ int btrfs_defrag_file(struct inode *ino
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_to_defrag)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct file_ra_state *ra = NULL;
        unsigned long last_index;
                if (!(inode->i_sb->s_flags & MS_ACTIVE))
                        break;
  
 -              if (btrfs_defrag_cancelled(root->fs_info)) {
 -                      btrfs_debug(root->fs_info, "defrag_file cancelled");
 +              if (btrfs_defrag_cancelled(fs_info)) {
 +                      btrfs_debug(fs_info, "defrag_file cancelled");
                        ret = -EAGAIN;
                        break;
                }
                 * we have to make sure the IO is actually started and that
                 * ordered extents get created before we return
                 */
 -              atomic_inc(&root->fs_info->async_submit_draining);
 -              while (atomic_read(&root->fs_info->nr_async_submits) ||
 -                    atomic_read(&root->fs_info->async_delalloc_pages)) {
 -                      wait_event(root->fs_info->async_submit_wait,
 -                         (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
 -                          atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 +              atomic_inc(&fs_info->async_submit_draining);
 +              while (atomic_read(&fs_info->nr_async_submits) ||
 +                     atomic_read(&fs_info->async_delalloc_pages)) {
 +                      wait_event(fs_info->async_submit_wait,
 +                                 (atomic_read(&fs_info->nr_async_submits) == 0 &&
 +                                  atomic_read(&fs_info->async_delalloc_pages) == 0));
                }
 -              atomic_dec(&root->fs_info->async_submit_draining);
 +              atomic_dec(&fs_info->async_submit_draining);
        }
  
        if (range->compress_type == BTRFS_COMPRESS_LZO) {
 -              btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
 +              btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
        }
  
        ret = defrag_count;
@@@ -1485,12 -1485,10 +1485,12 @@@ out_ra
  static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
  {
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
        if (ret)
                return ret;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                mnt_drop_write_file(file);
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
        }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                        ret = -EINVAL;
                        goto out_free;
                }
 -              btrfs_info(root->fs_info, "resizing devid %llu", devid);
 +              btrfs_info(fs_info, "resizing devid %llu", devid);
        }
  
 -      device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 +      device = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (!device) {
 -              btrfs_info(root->fs_info, "resizer unable to find device %llu",
 -                     devid);
 +              btrfs_info(fs_info, "resizer unable to find device %llu",
 +                         devid);
                ret = -ENODEV;
                goto out_free;
        }
  
        if (!device->writeable) {
 -              btrfs_info(root->fs_info,
 +              btrfs_info(fs_info,
                           "resizer unable to apply on readonly device %llu",
                       devid);
                ret = -EPERM;
                goto out_free;
        }
  
 -      new_size = div_u64(new_size, root->sectorsize);
 -      new_size *= root->sectorsize;
 +      new_size = div_u64(new_size, fs_info->sectorsize);
 +      new_size *= fs_info->sectorsize;
  
 -      btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
 -                    rcu_str_deref(device->name), new_size);
 +      btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
 +                        rcu_str_deref(device->name), new_size);
  
        if (new_size > old_size) {
                trans = btrfs_start_transaction(root, 0);
                        goto out_free;
                }
                ret = btrfs_grow_device(trans, device, new_size);
 -              btrfs_commit_transaction(trans, root);
 +              btrfs_commit_transaction(trans);
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
        } /* equal, nothing to do */
  out_free:
        kfree(vol_args);
  out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        mnt_drop_write_file(file);
        return ret;
  }
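
btrfs_ioctl_resize(), like the add/remove-device and dev-replace ioctls below,
serializes through fs_info->mutually_exclusive_operation_running with the
classic xchg test-and-set: the returned old value says whether another
exclusive operation was already running. A runnable C11 model of the guard,
with -1 standing in for BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int op_running;   /* models mutually_exclusive_operation_running */

    static int try_start_exclusive_op(void)
    {
            if (atomic_exchange(&op_running, 1))    /* old value != 0: busy */
                    return -1;
            return 0;
    }

    static void end_exclusive_op(void)
    {
            atomic_store(&op_running, 0);
    }

    int main(void)
    {
            printf("first:  %d\n", try_start_exclusive_op());   /* 0  */
            printf("second: %d\n", try_start_exclusive_op());   /* -1 */
            end_exclusive_op();
            return 0;
    }
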
@@@ -1775,7 -1774,6 +1775,7 @@@ static noinline int btrfs_ioctl_subvol_
                                                void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        u64 flags = 0;
        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
                return -EINVAL;
  
 -      down_read(&root->fs_info->subvol_sem);
 +      down_read(&fs_info->subvol_sem);
        if (btrfs_root_readonly(root))
                flags |= BTRFS_SUBVOL_RDONLY;
 -      up_read(&root->fs_info->subvol_sem);
 +      up_read(&fs_info->subvol_sem);
  
        if (copy_to_user(arg, &flags, sizeof(flags)))
                ret = -EFAULT;
@@@ -1798,7 -1796,6 +1798,7 @@@ static noinline int btrfs_ioctl_subvol_
                                              void __user *arg)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 root_flags;
                goto out_drop_write;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        /* nothing to do */
        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
                        spin_unlock(&root->root_item_lock);
                } else {
                        spin_unlock(&root->root_item_lock);
 -                      btrfs_warn(root->fs_info,
 -                      "Attempt to set subvolume %llu read-write during send",
 -                                      root->root_key.objectid);
 +                      btrfs_warn(fs_info,
 +                                 "Attempt to set subvolume %llu read-write during send",
 +                                 root->root_key.objectid);
                        ret = -EPERM;
                        goto out_drop_sem;
                }
                goto out_reset;
        }
  
 -      ret = btrfs_update_root(trans, root->fs_info->tree_root,
 +      ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
  
 -      btrfs_commit_transaction(trans, root);
 +      btrfs_commit_transaction(trans);
  out_reset:
        if (ret)
                btrfs_set_root_flags(&root->root_item, root_flags);
  out_drop_sem:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
  out_drop_write:
        mnt_drop_write_file(file);
  out:
   */
  static noinline int may_destroy_subvol(struct btrfs_root *root)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
                return -ENOMEM;
  
        /* Make sure this root isn't set as the default subvol */
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 -      di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
 +      di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
                                   dir_id, "default", 7, 0);
        if (di && !IS_ERR(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
                if (key.objectid == root->root_key.objectid) {
                        ret = -EPERM;
 -                      btrfs_err(root->fs_info,
 +                      btrfs_err(fs_info,
                                  "deleting default subvolume %llu is not allowed",
                                  key.objectid);
                        goto out;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = (u64)-1;
  
 -      ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
 -                              &key, path, 0, 0);
 +      ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        BUG_ON(ret == 0);
@@@ -2090,10 -2087,10 +2090,10 @@@ static noinline int search_ioctl(struc
                                 size_t *buf_size,
                                 char __user *ubuf)
  {
 +      struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root;
        struct btrfs_key key;
        struct btrfs_path *path;
 -      struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
        int ret;
        int num_found = 0;
        unsigned long sk_offset = 0;
@@@ -2356,7 -2353,6 +2356,7 @@@ static noinline int btrfs_ioctl_snap_de
                                             void __user *arg)
  {
        struct dentry *parent = file->f_path.dentry;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
        struct dentry *dentry;
        struct inode *dir = d_inode(parent);
        struct inode *inode;
                 * rmdir(2).
                 */
                err = -EPERM;
 -              if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
 +              if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
                        goto out_dput;
  
                /*
                spin_unlock(&dest->root_item_lock);
        } else {
                spin_unlock(&dest->root_item_lock);
 -              btrfs_warn(root->fs_info,
 -                      "Attempt to delete subvolume %llu during send",
 -                      dest->root_key.objectid);
 +              btrfs_warn(fs_info,
 +                         "Attempt to delete subvolume %llu during send",
 +                         dest->root_key.objectid);
                err = -EPERM;
                goto out_unlock_inode;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        err = may_destroy_subvol(dest);
        if (err)
  
        if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
                ret = btrfs_insert_orphan_item(trans,
 -                                      root->fs_info->tree_root,
 +                                      fs_info->tree_root,
                                        dest->root_key.objectid);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                }
        }
  
 -      ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 -                                dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
 +      ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
 +                                BTRFS_UUID_KEY_SUBVOL,
                                  dest->root_key.objectid);
        if (ret && ret != -ENOENT) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }
        if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
 -              ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 +              ret = btrfs_uuid_tree_rem(trans, fs_info,
                                          dest->root_item.received_uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          dest->root_key.objectid);
  out_end_trans:
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
 -      ret = btrfs_end_transaction(trans, root);
 +      ret = btrfs_end_transaction(trans);
        if (ret && !err)
                err = ret;
        inode->i_flags |= S_DEAD;
  out_release:
 -      btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 +      btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
  out_up_write:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
        if (err) {
                spin_lock(&dest->root_item_lock);
                root_flags = btrfs_root_flags(&dest->root_item);
@@@ -2659,7 -2655,7 +2659,7 @@@ out
        return ret;
  }
  
 -static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
  {
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 -      }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        }
  
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 -      ret = btrfs_init_new_device(root, vol_args->name);
 +      ret = btrfs_init_new_device(fs_info, vol_args->name);
  
        if (!ret)
 -              btrfs_info(root->fs_info, "disk added %s",vol_args->name);
 +              btrfs_info(fs_info, "disk added %s", vol_args->name);
  
        kfree(vol_args);
  out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
  static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args_v2 *vol_args;
        int ret;
  
        if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
                return -EOPNOTSUPP;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out;
        }
  
 -      mutex_lock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
        if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
 -              ret = btrfs_rm_device(root, NULL, vol_args->devid);
 +              ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
        } else {
                vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 -              ret = btrfs_rm_device(root, vol_args->name, 0);
 +              ret = btrfs_rm_device(fs_info, vol_args->name, 0);
        }
 -      mutex_unlock(&root->fs_info->volume_mutex);
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
  
        if (!ret) {
                if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
 -                      btrfs_info(root->fs_info, "device deleted: id %llu",
 +                      btrfs_info(fs_info, "device deleted: id %llu",
                                        vol_args->devid);
                else
 -                      btrfs_info(root->fs_info, "device deleted: %s",
 +                      btrfs_info(fs_info, "device deleted: %s",
                                        vol_args->name);
        }
  out:
@@@ -2746,8 -2744,7 +2746,8 @@@ err_drop
  
  static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
  
        if (ret)
                return ret;
  
 -      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +      if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out_drop_write;
        }
        }
  
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 -      mutex_lock(&root->fs_info->volume_mutex);
 -      ret = btrfs_rm_device(root, vol_args->name, 0);
 -      mutex_unlock(&root->fs_info->volume_mutex);
 +      mutex_lock(&fs_info->volume_mutex);
 +      ret = btrfs_rm_device(fs_info, vol_args->name, 0);
 +      mutex_unlock(&fs_info->volume_mutex);
  
        if (!ret)
 -              btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
 +              btrfs_info(fs_info, "disk deleted %s", vol_args->name);
        kfree(vol_args);
  out:
 -      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 +      atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
  out_drop_write:
        mnt_drop_write_file(file);
  
        return ret;
  }
  
 -static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 +                              void __user *arg)
  {
        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
 -      struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int ret = 0;
  
        fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
  
        mutex_lock(&fs_devices->device_list_mutex);
        fi_args->num_devices = fs_devices->num_devices;
 -      memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 +      memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
  
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->devid > fi_args->max_id)
        }
        mutex_unlock(&fs_devices->device_list_mutex);
  
 -      fi_args->nodesize = root->fs_info->super_copy->nodesize;
 -      fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
 -      fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
 +      fi_args->nodesize = fs_info->super_copy->nodesize;
 +      fi_args->sectorsize = fs_info->super_copy->sectorsize;
 +      fi_args->clone_alignment = fs_info->super_copy->sectorsize;
  
        if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 +                               void __user *arg)
  {
        struct btrfs_ioctl_dev_info_args *di_args;
        struct btrfs_device *dev;
 -      struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int ret = 0;
        char *s_uuid = NULL;
  
                s_uuid = di_args->uuid;
  
        mutex_lock(&fs_devices->device_list_mutex);
 -      dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
 +      dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
  
        if (!dev) {
                ret = -ENODEV;
@@@ -3309,10 -3305,10 +3309,10 @@@ static int clone_finish_inode_update(st
        ret = btrfs_update_inode(trans, root, inode);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                goto out;
        }
 -      ret = btrfs_end_transaction(trans, root);
 +      ret = btrfs_end_transaction(trans);
  out:
        return ret;
  }
@@@ -3410,10 -3406,9 +3410,10 @@@ static int clone_copy_inline_extent(str
                                    const u64 size,
                                    char *inline_data)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
        struct btrfs_root *root = BTRFS_I(dst)->root;
        const u64 aligned_end = ALIGN(new_key->offset + datal,
 -                                    root->sectorsize);
 +                                    fs_info->sectorsize);
        int ret;
        struct btrfs_key key;
  
@@@ -3534,7 -3529,6 +3534,7 @@@ static int btrfs_clone(struct inode *sr
                       const u64 off, const u64 olen, const u64 olen_aligned,
                       const u64 destoff, int no_time_update)
  {
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *leaf;
        u64 last_dest_end = destoff;
  
        ret = -ENOMEM;
 -      buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
 +      buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
        if (!buf) {
 -              buf = vmalloc(root->nodesize);
 +              buf = vmalloc(fs_info->nodesize);
                if (!buf)
                        return ret;
        }
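
btrfs_clone() allocates its scratch buffer with the try-kmalloc-then-vmalloc
idiom: __GFP_NOWARN silences the page allocator about a possibly large
contiguous request, because a vmalloc fallback exists. A kernel-side sketch of
the idiom only (later kernels fold exactly this into kvmalloc()/kvfree()):

    buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); /* fast, physically contiguous */
    if (!buf)
            buf = vmalloc(size);                    /* virtually contiguous fallback */
    if (!buf)
            return -ENOMEM;
    /* ... use buf ... */
    kvfree(buf);                                    /* frees either kind */
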
@@@ -3713,7 -3707,7 +3713,7 @@@ process_slot
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
  
                                                              &new_key, size);
                                if (ret) {
                                        btrfs_abort_transaction(trans, ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
  
  
                                if (disko) {
                                        inode_add_bytes(inode, datal);
 -                                      ret = btrfs_inc_extent_ref(trans, root,
 +                                      ret = btrfs_inc_extent_ref(trans,
 +                                                      fs_info,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
                                        if (ret) {
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                              btrfs_end_transaction(trans,
 -                                                                    root);
 +                                              btrfs_end_transaction(trans);
                                                goto out;
  
                                        }
  
                                if (comp && (skip || trim)) {
                                        ret = -EINVAL;
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
                                size -= skip + trim;
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
                                                                        ret);
 -                                      btrfs_end_transaction(trans, root);
 +                                      btrfs_end_transaction(trans);
                                        goto out;
                                }
                                leaf = path->nodes[0];
                        btrfs_release_path(path);
  
                        last_dest_end = ALIGN(new_key.offset + datal,
 -                                            root->sectorsize);
 +                                            fs_info->sectorsize);
                        ret = clone_finish_inode_update(trans, inode,
                                                        last_dest_end,
                                                        destoff, olen,
                if (ret) {
                        if (ret != -EOPNOTSUPP)
                                btrfs_abort_transaction(trans, ret);
 -                      btrfs_end_transaction(trans, root);
 +                      btrfs_end_transaction(trans);
                        goto out;
                }
                clone_update_extent_map(inode, trans, NULL, last_dest_end,
@@@ -3869,11 -3863,10 +3869,11 @@@ static noinline int btrfs_clone_files(s
  {
        struct inode *inode = file_inode(file);
        struct inode *src = file_inode(file_src);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
        u64 len = olen;
 -      u64 bs = root->fs_info->sb->s_blocksize;
 +      u64 bs = fs_info->sb->s_blocksize;
        int same_inode = src == inode;
  
        /*
@@@ -3987,18 -3980,6 +3987,6 @@@ out_unlock
        return ret;
  }
  
- ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
-                             struct file *file_out, loff_t pos_out,
-                             size_t len, unsigned int flags)
- {
-       ssize_t ret;
-       ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
-       if (ret == 0)
-               ret = len;
-       return ret;
- }
  int btrfs_clone_file_range(struct file *src_file, loff_t off,
                struct file *dst_file, loff_t destoff, u64 len)
  {
  static long btrfs_ioctl_trans_start(struct file *file)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret;
        if (ret)
                goto out;
  
 -      atomic_inc(&root->fs_info->open_ioctl_trans);
 +      atomic_inc(&fs_info->open_ioctl_trans);
  
        ret = -ENOMEM;
        trans = btrfs_start_ioctl_transaction(root);
        return 0;
  
  out_drop:
 -      atomic_dec(&root->fs_info->open_ioctl_trans);
 +      atomic_dec(&fs_info->open_ioctl_trans);
        mnt_drop_write_file(file);
  out:
        return ret;
  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        location.type = BTRFS_ROOT_ITEM_KEY;
        location.offset = (u64)-1;
  
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto out;
                goto out;
        }
  
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 -      di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
 +      di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
                                   dir_id, "default", 7, 1);
        if (IS_ERR_OR_NULL(di)) {
                btrfs_free_path(path);
 -              btrfs_end_transaction(trans, root);
 -              btrfs_err(new_root->fs_info,
 +              btrfs_end_transaction(trans);
 +              btrfs_err(fs_info,
                          "Umm, you don't have the default diritem, this isn't going to work");
                ret = -ENOENT;
                goto out;
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
 -      btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
 -      btrfs_end_transaction(trans, root);
 +      btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
 +      btrfs_end_transaction(trans);
  out:
        mnt_drop_write_file(file);
        return ret;
@@@ -4146,8 -4125,7 +4134,8 @@@ void btrfs_get_block_group_info(struct 
        }
  }
  
 -static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
 +                                 void __user *arg)
  {
        struct btrfs_ioctl_space_args space_args;
        struct btrfs_ioctl_space_info space;
  
                info = NULL;
                rcu_read_lock();
 -              list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
 +              list_for_each_entry_rcu(tmp, &fs_info->space_info,
                                        list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
  
                info = NULL;
                rcu_read_lock();
 -              list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
 +              list_for_each_entry_rcu(tmp, &fs_info->space_info,
                                        list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
         * Add global block reserve
         */
        if (slot_count) {
 -              struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
 +              struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
  
                spin_lock(&block_rsv->lock);
                space.total_bytes = block_rsv->size;
@@@ -4304,7 -4282,7 +4292,7 @@@ long btrfs_ioctl_trans_end(struct file 
                return -EINVAL;
        file->private_data = NULL;
  
 -      btrfs_end_transaction(trans, root);
 +      btrfs_end_transaction(trans);
  
        atomic_dec(&root->fs_info->open_ioctl_trans);
  
@@@ -4329,9 -4307,9 +4317,9 @@@ static noinline long btrfs_ioctl_start_
                goto out;
        }
        transid = trans->transid;
 -      ret = btrfs_commit_transaction_async(trans, root, 0);
 +      ret = btrfs_commit_transaction_async(trans, 0);
        if (ret) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                return ret;
        }
  out:
        return 0;
  }
  
 -static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
 +static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
                                           void __user *argp)
  {
        u64 transid;
        } else {
                transid = 0;  /* current trans */
        }
 -      return btrfs_wait_for_commit(root, transid);
 +      return btrfs_wait_for_commit(fs_info, transid);
  }
  
  static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
        struct btrfs_ioctl_scrub_args *sa;
        int ret;
  
                        goto out;
        }
  
 -      ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
 +      ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
                              &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
                              0);
  
        return ret;
  }
  
 -static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
  {
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      return btrfs_scrub_cancel(root->fs_info);
 +      return btrfs_scrub_cancel(fs_info);
  }
  
 -static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
 +static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
                                       void __user *arg)
  {
        struct btrfs_ioctl_scrub_args *sa;
        if (IS_ERR(sa))
                return PTR_ERR(sa);
  
 -      ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
 +      ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
 +static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
                                      void __user *arg)
  {
        struct btrfs_ioctl_get_dev_stats *sa;
                return -EPERM;
        }
  
 -      ret = btrfs_get_dev_stats(root, sa);
 +      ret = btrfs_get_dev_stats(fs_info, sa);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
        return ret;
  }
  
 -static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 +static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
 +                                  void __user *arg)
  {
        struct btrfs_ioctl_dev_replace_args *p;
        int ret;
  
        switch (p->cmd) {
        case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
 -              if (root->fs_info->sb->s_flags & MS_RDONLY) {
 +              if (fs_info->sb->s_flags & MS_RDONLY) {
                        ret = -EROFS;
                        goto out;
                }
                if (atomic_xchg(
 -                      &root->fs_info->mutually_exclusive_operation_running,
 -                      1)) {
 +                      &fs_info->mutually_exclusive_operation_running, 1)) {
                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                } else {
 -                      ret = btrfs_dev_replace_by_ioctl(root, p);
 +                      ret = btrfs_dev_replace_by_ioctl(fs_info, p);
                        atomic_set(
 -                       &root->fs_info->mutually_exclusive_operation_running,
 -                       0);
 +                       &fs_info->mutually_exclusive_operation_running, 0);
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
 -              btrfs_dev_replace_status(root->fs_info, p);
 +              btrfs_dev_replace_status(fs_info, p);
                ret = 0;
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
 -              ret = btrfs_dev_replace_cancel(root->fs_info, p);
 +              ret = btrfs_dev_replace_cancel(fs_info, p);
                break;
        default:
                ret = -EINVAL;
@@@ -4568,7 -4547,7 +4556,7 @@@ static int build_ino_list(u64 inum, u6
        return 0;
  }
  
 -static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 +static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
                                        void __user *arg)
  {
        int ret = 0;
                return -EPERM;
  
        loi = memdup_user(arg, sizeof(*loi));
 -      if (IS_ERR(loi)) {
 -              ret = PTR_ERR(loi);
 -              loi = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(loi))
 +              return PTR_ERR(loi);
  
        path = btrfs_alloc_path();
        if (!path) {
                goto out;
        }
  
 -      ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
 +      ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
                                          build_ino_list, inodes);
        if (ret == -EINVAL)
                ret = -ENOENT;
        return ret;
  }
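
The error-path simplification above relies on memdup_user() returning either a valid pointer or an ERR_PTR(), never NULL: when the duplication is the first allocation in the function there is nothing to unwind yet, so the NULL-assignment-plus-goto dance collapses to a direct return. The same cleanup repeats in both btrfs_ioctl_set_received_subvol variants further down:

	p = memdup_user(arg, sizeof(*p));
	if (IS_ERR(p))
		return PTR_ERR(p);	/* nothing allocated yet, nothing to unwind */
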
  
 -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
 +static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
  {
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
        switch (cmd) {
        case BTRFS_BALANCE_CTL_PAUSE:
 -              return btrfs_pause_balance(root->fs_info);
 +              return btrfs_pause_balance(fs_info);
        case BTRFS_BALANCE_CTL_CANCEL:
 -              return btrfs_cancel_balance(root->fs_info);
 +              return btrfs_cancel_balance(fs_info);
        }
  
        return -EINVAL;
  }
  
 -static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
 +static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
                                         void __user *arg)
  {
 -      struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ioctl_balance_args *bargs;
        int ret = 0;
  
@@@ -4843,8 -4826,7 +4831,8 @@@ out
  
  static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_ctl_args *sa;
        struct btrfs_trans_handle *trans = NULL;
        int ret;
                goto drop_write;
        }
  
 -      down_write(&root->fs_info->subvol_sem);
 -      trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
 +      down_write(&fs_info->subvol_sem);
 +      trans = btrfs_start_transaction(fs_info->tree_root, 2);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
  
        switch (sa->cmd) {
        case BTRFS_QUOTA_CTL_ENABLE:
 -              ret = btrfs_quota_enable(trans, root->fs_info);
 +              ret = btrfs_quota_enable(trans, fs_info);
                break;
        case BTRFS_QUOTA_CTL_DISABLE:
 -              ret = btrfs_quota_disable(trans, root->fs_info);
 +              ret = btrfs_quota_disable(trans, fs_info);
                break;
        default:
                ret = -EINVAL;
                break;
        }
  
 -      err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
 +      err = btrfs_commit_transaction(trans);
        if (err && !ret)
                ret = err;
  out:
        kfree(sa);
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
  drop_write:
        mnt_drop_write_file(file);
        return ret;
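
A recurring idiom in these handlers is worth noting: the result of ending or committing the transaction only overrides the return value when the ioctl body itself succeeded, so the first error wins:

	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;	/* report the commit failure only if nothing failed earlier */
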
  
  static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
  
        /* FIXME: check if the IDs really exist */
        if (sa->assign) {
 -              ret = btrfs_add_qgroup_relation(trans, root->fs_info,
 +              ret = btrfs_add_qgroup_relation(trans, fs_info,
                                                sa->src, sa->dst);
        } else {
 -              ret = btrfs_del_qgroup_relation(trans, root->fs_info,
 +              ret = btrfs_del_qgroup_relation(trans, fs_info,
                                                sa->src, sa->dst);
        }
  
        /* update qgroup status and info */
 -      err = btrfs_run_qgroups(trans, root->fs_info);
 +      err = btrfs_run_qgroups(trans, fs_info);
        if (err < 0)
 -              btrfs_handle_fs_error(root->fs_info, err,
 -                          "failed to update qgroup status and info");
 -      err = btrfs_end_transaction(trans, root);
 +              btrfs_handle_fs_error(fs_info, err,
 +                                    "failed to update qgroup status and info");
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -4949,9 -4929,7 +4937,9 @@@ drop_write
  
  static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_create_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
  
        /* FIXME: check if the IDs really exist */
        if (sa->create) {
 -              ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
 +              ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
        } else {
 -              ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
 +              ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
        }
  
 -      err = btrfs_end_transaction(trans, root);
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -5001,9 -4979,7 +4989,9 @@@ drop_write
  
  static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_limit_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        }
  
        /* FIXME: check if the IDs really exist */
 -      ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
 +      ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
  
 -      err = btrfs_end_transaction(trans, root);
 +      err = btrfs_end_transaction(trans);
        if (err && !ret)
                ret = err;
  
@@@ -5051,8 -5027,7 +5039,8 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret;
  
                goto out;
        }
  
 -      ret = btrfs_qgroup_rescan(root->fs_info);
 +      ret = btrfs_qgroup_rescan(fs_info);
  
  out:
        kfree(qsa);
@@@ -5085,8 -5060,7 +5073,8 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret = 0;
  
        if (!qsa)
                return -ENOMEM;
  
 -      if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
 +      if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
                qsa->flags = 1;
 -              qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
 +              qsa->progress = fs_info->qgroup_rescan_progress.objectid;
        }
  
        if (copy_to_user(arg, qsa, sizeof(*qsa)))
  
  static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      return btrfs_qgroup_wait_for_completion(root->fs_info, true);
 +      return btrfs_qgroup_wait_for_completion(fs_info, true);
  }
  
  static long _btrfs_ioctl_set_received_subvol(struct file *file,
                                            struct btrfs_ioctl_received_subvol_args *sa)
  {
        struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct btrfs_trans_handle *trans;
        if (ret < 0)
                return ret;
  
 -      down_write(&root->fs_info->subvol_sem);
 +      down_write(&fs_info->subvol_sem);
  
        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
                ret = -EINVAL;
                                       BTRFS_UUID_SIZE);
        if (received_uuid_changed &&
            !btrfs_is_empty_uuid(root_item->received_uuid))
 -              btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
 -                                  root_item->received_uuid,
 +              btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
                                    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                    root->root_key.objectid);
        memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
        btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
        btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
  
 -      ret = btrfs_update_root(trans, root->fs_info->tree_root,
 +      ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
        if (ret < 0) {
 -              btrfs_end_transaction(trans, root);
 +              btrfs_end_transaction(trans);
                goto out;
        }
        if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
 -              ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
 -                                        sa->uuid,
 +              ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          root->root_key.objectid);
                if (ret < 0 && ret != -EEXIST) {
                        goto out;
                }
        }
 -      ret = btrfs_commit_transaction(trans, root);
 +      ret = btrfs_commit_transaction(trans);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
  
  out:
 -      up_write(&root->fs_info->subvol_sem);
 +      up_write(&fs_info->subvol_sem);
        mnt_drop_write_file(file);
        return ret;
  }
@@@ -5217,8 -5191,11 +5205,8 @@@ static long btrfs_ioctl_set_received_su
        int ret = 0;
  
        args32 = memdup_user(arg, sizeof(*args32));
 -      if (IS_ERR(args32)) {
 -              ret = PTR_ERR(args32);
 -              args32 = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(args32))
 +              return PTR_ERR(args32);
  
        args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
        if (!args64) {
@@@ -5266,8 -5243,11 +5254,8 @@@ static long btrfs_ioctl_set_received_su
        int ret = 0;
  
        sa = memdup_user(arg, sizeof(*sa));
 -      if (IS_ERR(sa)) {
 -              ret = PTR_ERR(sa);
 -              sa = NULL;
 -              goto out;
 -      }
 +      if (IS_ERR(sa))
 +              return PTR_ERR(sa);
  
        ret = _btrfs_ioctl_set_received_subvol(file, sa);
  
  
  static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        size_t len;
        int ret;
        char label[BTRFS_LABEL_SIZE];
  
 -      spin_lock(&root->fs_info->super_lock);
 -      memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
 -      spin_unlock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
 +      memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
 +      spin_unlock(&fs_info->super_lock);
  
        len = strnlen(label, BTRFS_LABEL_SIZE);
  
        if (len == BTRFS_LABEL_SIZE) {
 -              btrfs_warn(root->fs_info,
 -                      "label is too long, return the first %zu bytes", --len);
 +              btrfs_warn(fs_info,
 +                         "label is too long, return the first %zu bytes",
 +                         --len);
        }
  
        ret = copy_to_user(arg, label, len);
  
  static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_trans_handle *trans;
        char label[BTRFS_LABEL_SIZE];
        int ret;
                return -EFAULT;
  
        if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
 -              btrfs_err(root->fs_info,
 +              btrfs_err(fs_info,
                          "unable to set label with more than %d bytes",
                          BTRFS_LABEL_SIZE - 1);
                return -EINVAL;
                goto out_unlock;
        }
  
 -      spin_lock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
        strcpy(super_block->label, label);
 -      spin_unlock(&root->fs_info->super_lock);
 -      ret = btrfs_commit_transaction(trans, root);
 +      spin_unlock(&fs_info->super_lock);
 +      ret = btrfs_commit_transaction(trans);
  
  out_unlock:
        mnt_drop_write_file(file);
@@@ -5372,9 -5348,8 +5360,9 @@@ int btrfs_ioctl_get_supported_features(
  
  static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags features;
  
        features.compat_flags = btrfs_super_compat_flags(super_block);
        return 0;
  }
  
 -static int check_feature_bits(struct btrfs_root *root,
 +static int check_feature_bits(struct btrfs_fs_info *fs_info,
                              enum btrfs_feature_set set,
                              u64 change_mask, u64 flags, u64 supported_flags,
                              u64 safe_set, u64 safe_clear)
        if (unsupported) {
                names = btrfs_printable_features(set, unsupported);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "this kernel does not support the %s feature bit%s",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "this kernel does not support the %s feature bit%s",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "this kernel does not support %s bits 0x%llx",
 -                         type, unsupported);
 +                      btrfs_warn(fs_info,
 +                                 "this kernel does not support %s bits 0x%llx",
 +                                 type, unsupported);
                return -EOPNOTSUPP;
        }
  
        if (disallowed) {
                names = btrfs_printable_features(set, disallowed);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "can't set the %s feature bit%s while mounted",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "can't set the %s feature bit%s while mounted",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "can't set %s bits 0x%llx while mounted",
 -                         type, disallowed);
 +                      btrfs_warn(fs_info,
 +                                 "can't set %s bits 0x%llx while mounted",
 +                                 type, disallowed);
                return -EPERM;
        }
  
        if (disallowed) {
                names = btrfs_printable_features(set, disallowed);
                if (names) {
 -                      btrfs_warn(root->fs_info,
 -                         "can't clear the %s feature bit%s while mounted",
 -                         names, strchr(names, ',') ? "s" : "");
 +                      btrfs_warn(fs_info,
 +                                 "can't clear the %s feature bit%s while mounted",
 +                                 names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                } else
 -                      btrfs_warn(root->fs_info,
 -                         "can't clear %s bits 0x%llx while mounted",
 -                         type, disallowed);
 +                      btrfs_warn(fs_info,
 +                                 "can't clear %s bits 0x%llx while mounted",
 +                                 type, disallowed);
                return -EPERM;
        }
  
        return 0;
  }
  
 -#define check_feature(root, change_mask, flags, mask_base)    \
 -check_feature_bits(root, FEAT_##mask_base, change_mask, flags,        \
 +#define check_feature(fs_info, change_mask, flags, mask_base) \
 +check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags,     \
                   BTRFS_FEATURE_ ## mask_base ## _SUPP,        \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,    \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
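
The macro keeps its shape; only the first argument changes type. For reference, the COMPAT check below expands to (expansion written out by hand):

	ret = check_feature(fs_info, flags[0].compat_flags,
			    flags[1].compat_flags, COMPAT);
	/* becomes: */
	ret = check_feature_bits(fs_info, FEAT_COMPAT,
				 flags[0].compat_flags, flags[1].compat_flags,
				 BTRFS_FEATURE_COMPAT_SUPP,
				 BTRFS_FEATURE_COMPAT_SAFE_SET,
				 BTRFS_FEATURE_COMPAT_SAFE_CLEAR);
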
  
  static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 -      struct btrfs_super_block *super_block = root->fs_info->super_copy;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_super_block *super_block = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags flags[2];
        struct btrfs_trans_handle *trans;
        u64 newflags;
            !flags[0].incompat_flags)
                return 0;
  
 -      ret = check_feature(root, flags[0].compat_flags,
 +      ret = check_feature(fs_info, flags[0].compat_flags,
                            flags[1].compat_flags, COMPAT);
        if (ret)
                return ret;
  
 -      ret = check_feature(root, flags[0].compat_ro_flags,
 +      ret = check_feature(fs_info, flags[0].compat_ro_flags,
                            flags[1].compat_ro_flags, COMPAT_RO);
        if (ret)
                return ret;
  
 -      ret = check_feature(root, flags[0].incompat_flags,
 +      ret = check_feature(fs_info, flags[0].incompat_flags,
                            flags[1].incompat_flags, INCOMPAT);
        if (ret)
                return ret;
                goto out_drop_write;
        }
  
 -      spin_lock(&root->fs_info->super_lock);
 +      spin_lock(&fs_info->super_lock);
        newflags = btrfs_super_compat_flags(super_block);
        newflags |= flags[0].compat_flags & flags[1].compat_flags;
        newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
        newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
        newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
        btrfs_set_super_incompat_flags(super_block, newflags);
 -      spin_unlock(&root->fs_info->super_lock);
 +      spin_unlock(&fs_info->super_lock);
  
 -      ret = btrfs_commit_transaction(trans, root);
 +      ret = btrfs_commit_transaction(trans);
  out_drop_write:
        mnt_drop_write_file(file);
  
  long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
  {
 -      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 +      struct inode *inode = file_inode(file);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
        void __user *argp = (void __user *)arg;
  
        switch (cmd) {
        case BTRFS_IOC_RESIZE:
                return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
 -              return btrfs_ioctl_add_dev(root, argp);
 +              return btrfs_ioctl_add_dev(fs_info, argp);
        case BTRFS_IOC_RM_DEV:
                return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_RM_DEV_V2:
                return btrfs_ioctl_rm_dev_v2(file, argp);
        case BTRFS_IOC_FS_INFO:
 -              return btrfs_ioctl_fs_info(root, argp);
 +              return btrfs_ioctl_fs_info(fs_info, argp);
        case BTRFS_IOC_DEV_INFO:
 -              return btrfs_ioctl_dev_info(root, argp);
 +              return btrfs_ioctl_dev_info(fs_info, argp);
        case BTRFS_IOC_BALANCE:
                return btrfs_ioctl_balance(file, NULL);
        case BTRFS_IOC_TRANS_START:
        case BTRFS_IOC_INO_PATHS:
                return btrfs_ioctl_ino_to_path(root, argp);
        case BTRFS_IOC_LOGICAL_INO:
 -              return btrfs_ioctl_logical_to_ino(root, argp);
 +              return btrfs_ioctl_logical_to_ino(fs_info, argp);
        case BTRFS_IOC_SPACE_INFO:
 -              return btrfs_ioctl_space_info(root, argp);
 +              return btrfs_ioctl_space_info(fs_info, argp);
        case BTRFS_IOC_SYNC: {
                int ret;
  
 -              ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 +              ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
                if (ret)
                        return ret;
 -              ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
 +              ret = btrfs_sync_fs(inode->i_sb, 1);
                /*
                 * The transaction thread may want to do more work,
                 * namely it pokes the cleaner kthread that will start
                 * processing uncleaned subvols.
                 */
 -              wake_up_process(root->fs_info->transaction_kthread);
 +              wake_up_process(fs_info->transaction_kthread);
                return ret;
        }
        case BTRFS_IOC_START_SYNC:
                return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
 -              return btrfs_ioctl_wait_sync(root, argp);
 +              return btrfs_ioctl_wait_sync(fs_info, argp);
        case BTRFS_IOC_SCRUB:
                return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
 -              return btrfs_ioctl_scrub_cancel(root, argp);
 +              return btrfs_ioctl_scrub_cancel(fs_info);
        case BTRFS_IOC_SCRUB_PROGRESS:
 -              return btrfs_ioctl_scrub_progress(root, argp);
 +              return btrfs_ioctl_scrub_progress(fs_info, argp);
        case BTRFS_IOC_BALANCE_V2:
                return btrfs_ioctl_balance(file, argp);
        case BTRFS_IOC_BALANCE_CTL:
 -              return btrfs_ioctl_balance_ctl(root, arg);
 +              return btrfs_ioctl_balance_ctl(fs_info, arg);
        case BTRFS_IOC_BALANCE_PROGRESS:
 -              return btrfs_ioctl_balance_progress(root, argp);
 +              return btrfs_ioctl_balance_progress(fs_info, argp);
        case BTRFS_IOC_SET_RECEIVED_SUBVOL:
                return btrfs_ioctl_set_received_subvol(file, argp);
  #ifdef CONFIG_64BIT
        case BTRFS_IOC_SEND:
                return btrfs_ioctl_send(file, argp);
        case BTRFS_IOC_GET_DEV_STATS:
 -              return btrfs_ioctl_get_dev_stats(root, argp);
 +              return btrfs_ioctl_get_dev_stats(fs_info, argp);
        case BTRFS_IOC_QUOTA_CTL:
                return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
        case BTRFS_IOC_QUOTA_RESCAN_WAIT:
                return btrfs_ioctl_quota_rescan_wait(file, argp);
        case BTRFS_IOC_DEV_REPLACE:
 -              return btrfs_ioctl_dev_replace(root, argp);
 +              return btrfs_ioctl_dev_replace(fs_info, argp);
        case BTRFS_IOC_GET_FSLABEL:
                return btrfs_ioctl_get_fslabel(file, argp);
        case BTRFS_IOC_SET_FSLABEL:
diff --combined fs/ceph/addr.c
@@@ -315,32 -315,7 +315,32 @@@ static int start_read(struct inode *ino
        struct page **pages;
        pgoff_t next_index;
        int nr_pages = 0;
 -      int ret;
 +      int got = 0;
 +      int ret = 0;
 +
 +      if (!current->journal_info) {
 +              /* caller of readpages does not hold buffer and read caps
 +               * (fadvise, madvise and readahead cases) */
 +              int want = CEPH_CAP_FILE_CACHE;
 +              ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
 +              if (ret < 0) {
 +                      dout("start_read %p, error getting cap\n", inode);
 +              } else if (!(got & want)) {
 +                      dout("start_read %p, no cache cap\n", inode);
 +                      ret = 0;
 +              }
 +              if (ret <= 0) {
 +                      if (got)
 +                              ceph_put_cap_refs(ci, got);
 +                      while (!list_empty(page_list)) {
 +                              page = list_entry(page_list->prev,
 +                                                struct page, lru);
 +                              list_del(&page->lru);
 +                              put_page(page);
 +                      }
 +                      return ret;
 +              }
 +      }
  
        off = (u64) page_offset(page);
  
                                    CEPH_OSD_FLAG_READ, NULL,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    false);
 -      if (IS_ERR(req))
 -              return PTR_ERR(req);
 +      if (IS_ERR(req)) {
 +              ret = PTR_ERR(req);
 +              goto out;
 +      }
  
        /* build page vector */
        nr_pages = calc_pages_for(0, len);
        pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
 -      ret = -ENOMEM;
 -      if (!pages)
 -              goto out;
 +      if (!pages) {
 +              ret = -ENOMEM;
 +              goto out_put;
 +      }
        for (i = 0; i < nr_pages; ++i) {
                page = list_entry(page_list->prev, struct page, lru);
                BUG_ON(PageLocked(page));
        if (ret < 0)
                goto out_pages;
        ceph_osdc_put_request(req);
 +
 +      /* After adding locked pages to page cache, the inode holds cache cap.
 +       * So we can drop our cap refs. */
 +      if (got)
 +              ceph_put_cap_refs(ci, got);
 +
        return nr_pages;
  
  out_pages:
                unlock_page(pages[i]);
        }
        ceph_put_page_vector(pages, nr_pages, false);
 -out:
 +out_put:
        ceph_osdc_put_request(req);
 +out:
 +      if (got)
 +              ceph_put_cap_refs(ci, got);
        return ret;
  }
  
@@@ -461,6 -424,7 +461,6 @@@ static int ceph_readpages(struct file *
                rc = start_read(inode, page_list, max);
                if (rc < 0)
                        goto out;
 -              BUG_ON(rc == 0);
        }
  out:
        ceph_fscache_readpages_cancel(inode, page_list);
   * only snap context we are allowed to write back.
   */
  static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 -                                                  loff_t *snap_size)
 +                                                  loff_t *snap_size,
 +                                                  u64 *truncate_size,
 +                                                  u32 *truncate_seq)
  {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_snap_context *snapc = NULL;
                        snapc = ceph_get_snap_context(capsnap->context);
                        if (snap_size)
                                *snap_size = capsnap->size;
 +                      if (truncate_size)
 +                              *truncate_size = capsnap->truncate_size;
 +                      if (truncate_seq)
 +                              *truncate_seq = capsnap->truncate_seq;
                        break;
                }
        }
                snapc = ceph_get_snap_context(ci->i_head_snapc);
                dout(" head snapc %p has %d dirty pages\n",
                     snapc, ci->i_wrbuffer_ref_head);
 +              if (truncate_size)
 +                      *truncate_size = ci->i_truncate_size;
 +              if (truncate_seq)
 +                      *truncate_seq = ci->i_truncate_seq;
        }
        spin_unlock(&ci->i_ceph_lock);
        return snapc;
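
get_oldest_context() now hands back the truncate metadata that belongs with whichever snap context it picked, captured in a single pass under i_ceph_lock: the capsnap branch reports the snapshot's values, the head branch the inode's current ones (after the loop, capsnap no longer points at a valid entry, so the head branch must read from the inode). That is what lets the separate spin_lock(&ci->i_ceph_lock) blocks in writepage_nounlock() and the writepages path disappear in the hunks below; callers get everything in one call:

	snapc = get_oldest_context(inode, &snap_size,
				   &truncate_size, &truncate_seq);
	if (snap_size == -1)
		snap_size = i_size_read(inode);	/* head data, not a snapshot */
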
@@@ -547,8 -501,7 +547,8 @@@ static int writepage_nounlock(struct pa
                dout("writepage %p page %p not dirty?\n", inode, page);
                goto out;
        }
 -      oldest = get_oldest_context(inode, &snap_size);
 +      oldest = get_oldest_context(inode, &snap_size,
 +                                  &truncate_size, &truncate_seq);
        if (snapc->seq > oldest->seq) {
                dout("writepage %p page %p snapc %p not writeable - noop\n",
                     inode, page, snapc);
        }
        ceph_put_snap_context(oldest);
  
 -      spin_lock(&ci->i_ceph_lock);
 -      truncate_seq = ci->i_truncate_seq;
 -      truncate_size = ci->i_truncate_size;
        if (snap_size == -1)
                snap_size = i_size_read(inode);
 -      spin_unlock(&ci->i_ceph_lock);
  
        /* is this a partial page at end of file? */
        if (page_off >= snap_size) {
@@@ -807,8 -764,7 +807,8 @@@ retry
        /* find oldest snap context with dirty data */
        ceph_put_snap_context(snapc);
        snap_size = -1;
 -      snapc = get_oldest_context(inode, &snap_size);
 +      snapc = get_oldest_context(inode, &snap_size,
 +                                 &truncate_size, &truncate_seq);
        if (!snapc) {
                /* hmm, why does writepages get called when there
                   is no dirty data? */
        dout(" oldest snapc is %p seq %lld (%d snaps)\n",
             snapc, snapc->seq, snapc->num_snaps);
  
 -      spin_lock(&ci->i_ceph_lock);
 -      truncate_seq = ci->i_truncate_seq;
 -      truncate_size = ci->i_truncate_size;
        i_size = i_size_read(inode);
 -      spin_unlock(&ci->i_ceph_lock);
  
        if (last_snapc && snapc != last_snapc) {
                /* if we switched to a newer snapc, restart our scan at the
@@@ -1164,8 -1124,7 +1164,8 @@@ out
  static int context_is_writeable_or_written(struct inode *inode,
                                           struct ceph_snap_context *snapc)
  {
 -      struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
 +      struct ceph_snap_context *oldest = get_oldest_context(inode, NULL,
 +                                                            NULL, NULL);
        int ret = !oldest || snapc->seq <= oldest->seq;
  
        ceph_put_snap_context(oldest);
@@@ -1210,7 -1169,7 +1210,7 @@@ retry_locked
                 * this page is already dirty in another (older) snap
                 * context!  is it writeable now?
                 */
 -              oldest = get_oldest_context(inode, NULL);
 +              oldest = get_oldest_context(inode, NULL, NULL, NULL);
  
                if (snapc->seq > oldest->seq) {
                        ceph_put_snap_context(oldest);
@@@ -1317,25 -1276,27 +1317,27 @@@ static int ceph_write_end(struct file *
                          struct page *page, void *fsdata)
  {
        struct inode *inode = file_inode(file);
-       unsigned from = pos & (PAGE_SIZE - 1);
        int check_cap = 0;
  
        dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
             inode, page, (int)pos, (int)copied, (int)len);
  
        /* zero the stale part of the page if we did a short copy */
-       if (copied < len)
-               zero_user_segment(page, from+copied, len);
+       if (!PageUptodate(page)) {
+               if (copied < len) {
+                       copied = 0;
+                       goto out;
+               }
+               SetPageUptodate(page);
+       }
  
        /* did file size increase? */
        if (pos+copied > i_size_read(inode))
                check_cap = ceph_inode_set_size(inode, pos+copied);
  
-       if (!PageUptodate(page))
-               SetPageUptodate(page);
        set_page_dirty(page);
  
+ out:
        unlock_page(page);
        put_page(page);
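
The rewritten ceph_write_end() no longer papers over a short copy by zeroing the tail of a page that was never uptodate. Instead it reports zero bytes copied, which is the convention generic_perform_write() in mm/filemap.c follows for "retry this page": the caller shrinks the write to a single iovec segment it can fault in first and goes around again, so no stale zeros ever become visible. Roughly, on the caller's side (simplified sketch):

	copied = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);
	if (unlikely(copied == 0)) {
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
			      iov_iter_single_seg_count(i));
		goto again;	/* re-fault the user page and retry */
	}
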
  
@@@ -1412,11 -1373,9 +1414,11 @@@ static int ceph_filemap_fault(struct vm
             inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
  
        if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
 -          ci->i_inline_version == CEPH_INLINE_NONE)
 +          ci->i_inline_version == CEPH_INLINE_NONE) {
 +              current->journal_info = vma->vm_file;
                ret = filemap_fault(vma, vmf);
 -      else
 +              current->journal_info = NULL;
 +      } else
                ret = -EAGAIN;
  
        dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
@@@ -1948,15 -1907,6 +1950,15 @@@ int ceph_pool_perm_check(struct ceph_in
        struct ceph_string *pool_ns;
        int ret, flags;
  
 +      if (ci->i_vino.snap != CEPH_NOSNAP) {
 +              /*
 +               * Pool permission check needs to write to the first object.
 +               * But for snapshot, head of the first object may have already
 +               * been deleted. Skip check to avoid creating orphan object.
 +               */
 +              return 0;
 +      }
 +
        if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
                                NOPOOLPERM))
                return 0;
diff --combined fs/ext4/super.c
@@@ -863,6 -863,7 +863,6 @@@ static void ext4_put_super(struct super
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 -      brelse(sbi->s_sbh);
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(sbi->s_qf_names[i]);
        }
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 +      brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
@@@ -1114,55 -1114,37 +1114,55 @@@ static int ext4_prepare_context(struct 
  static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                                                        void *fs_data)
  {
 -      handle_t *handle;
 -      int res, res2;
 +      handle_t *handle = fs_data;
 +      int res, res2, retries = 0;
 +
 +      /*
 +       * If a journal handle was specified, then the encryption context is
 +       * being set on a new inode via inheritance and is part of a larger
 +       * transaction to create the inode.  Otherwise the encryption context is
 +       * being set on an existing inode in its own transaction.  Only in the
 +       * latter case should the "retry on ENOSPC" logic be used.
 +       */
  
 -      /* fs_data is null when internally used. */
 -      if (fs_data) {
 -              res  = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
 -                              EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
 -                              len, 0);
 +      if (handle) {
 +              res = ext4_xattr_set_handle(handle, inode,
 +                                          EXT4_XATTR_INDEX_ENCRYPTION,
 +                                          EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
 +                                          ctx, len, 0);
                if (!res) {
                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
 +                      /*
 +                       * Update inode->i_flags - e.g. S_DAX may get disabled
 +                       */
 +                      ext4_set_inode_flags(inode);
                }
                return res;
        }
  
 +retry:
        handle = ext4_journal_start(inode, EXT4_HT_MISC,
                        ext4_jbd2_credits_xattr(inode));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
  
 -      res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
 -                      EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
 -                      len, 0);
 +      res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
 +                                  EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
 +                                  ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
 +              /* Update inode->i_flags - e.g. S_DAX may get disabled */
 +              ext4_set_inode_flags(inode);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
        }
        res2 = ext4_journal_stop(handle);
 +
 +      if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 +              goto retry;
        if (!res)
                res = res2;
        return res;
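
When no handle is passed in, the context is being set on an existing inode and the xattr write can run out of space; the new loop is the standard ext4 ENOSPC idiom: stop the handle, let ext4_should_retry_alloc() try to free space by forcing a journal commit, and start a fresh transaction, giving up after a few attempts. The skeleton:

	int retries = 0;
retry:
	handle = ext4_journal_start(inode, EXT4_HT_MISC,
				    ext4_jbd2_credits_xattr(inode));
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	res = ext4_xattr_set_handle(handle, inode, ...);	/* may hit ENOSPC */
	res2 = ext4_journal_stop(handle);
	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
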
@@@ -1205,7 -1187,7 +1205,7 @@@ static int ext4_release_dquot(struct dq
  static int ext4_mark_dquot_dirty(struct dquot *dquot);
  static int ext4_write_info(struct super_block *sb, int type);
  static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                        struct path *path);
+                        const struct path *path);
  static int ext4_quota_off(struct super_block *sb, int type);
  static int ext4_quota_on_mount(struct super_block *sb, int type);
  static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@@ -1901,6 -1883,12 +1901,6 @@@ static int parse_options(char *options
                        return 0;
                }
        }
 -      if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
 -          test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 -              ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
 -                       "in data=ordered mode");
 -              return 0;
 -      }
        return 1;
  }
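
The removed journal_async_commit/data=ordered rejection is not lost; it reappears where the final journalling mode is actually known: once in ext4_fill_super() after the journal has been loaded (so it can bail out via failed_mount_wq) and once in ext4_remount(), as the hunks further down show:

	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
		ext4_msg(sb, KERN_ERR, "can't mount with "
			"journal_async_commit in data=ordered mode");
		goto failed_mount_wq;
	}
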
  
@@@ -2342,7 -2330,7 +2342,7 @@@ static void ext4_orphan_cleanup(struct 
                                struct ext4_super_block *es)
  {
        unsigned int s_flags = sb->s_flags;
 -      int nr_orphans = 0, nr_truncates = 0;
 +      int ret, nr_orphans = 0, nr_truncates = 0;
  #ifdef CONFIG_QUOTA
        int i;
  #endif
                                  inode->i_ino, inode->i_size);
                        inode_lock(inode);
                        truncate_inode_pages(inode->i_mapping, inode->i_size);
 -                      ext4_truncate(inode);
 +                      ret = ext4_truncate(inode);
 +                      if (ret)
 +                              ext4_std_error(inode->i_sb, ret);
                        inode_unlock(inode);
                        nr_truncates++;
                } else {
@@@ -3207,15 -3193,10 +3207,15 @@@ static int count_overhead(struct super_
                        ext4_set_bit(s++, buf);
                        count++;
                }
 -              for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
 -                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
 -                      count++;
 +              j = ext4_bg_num_gdb(sb, grp);
 +              if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
 +                      ext4_error(sb, "Invalid number of block group "
 +                                 "descriptor blocks: %d", j);
 +                      j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
 +              count += j;
 +              for (; j > 0; j--)
 +                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        if (!count)
                return 0;
@@@ -3320,7 -3301,7 +3320,7 @@@ static int ext4_fill_super(struct super
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
 -      struct ext4_sb_info *sbi;
 +      struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        ext4_fsblk_t block;
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        ext4_group_t first_not_zeroed;
  
 -      sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 -      if (!sbi)
 -              goto out_free_orig;
 +      if ((data && !orig_data) || !sbi)
 +              goto out_free_base;
  
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 -      if (!sbi->s_blockgroup_lock) {
 -              kfree(sbi);
 -              goto out_free_orig;
 -      }
 +      if (!sbi->s_blockgroup_lock)
 +              goto out_free_base;
 +
        sb->s_fs_info = sbi;
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
  
 -      if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
 -                         &journal_devnum, &journal_ioprio, 0)) {
 -              ext4_msg(sb, KERN_WARNING,
 -                       "failed to parse options in superblock: %s",
 -                       sbi->s_es->s_mount_opts);
 +      if (sbi->s_es->s_mount_opts[0]) {
 +              char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
 +                                            sizeof(sbi->s_es->s_mount_opts),
 +                                            GFP_KERNEL);
 +              if (!s_mount_opts)
 +                      goto failed_mount;
 +              if (!parse_options(s_mount_opts, sb, &journal_devnum,
 +                                 &journal_ioprio, 0)) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "failed to parse options in superblock: %s",
 +                               s_mount_opts);
 +              }
 +              kfree(s_mount_opts);
        }
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        if (!parse_options((char *) data, sb, &journal_devnum,
                                 "both data=journal and dax");
                        goto failed_mount;
                }
 +              if (ext4_has_feature_encrypt(sb)) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "encrypted files will use data=ordered "
 +                               "instead of data journaling mode");
 +              }
                if (test_opt(sb, DELALLOC))
                        clear_opt(sb, DELALLOC);
        } else {
  
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 -      if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
 -              goto cantfind_ext4;
  
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
 +      if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
 +          sbi->s_inodes_per_group > blocksize * 8) {
 +              ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
 +                       sbi->s_inodes_per_group);
 +              goto failed_mount;
 +      }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
        }
        sbi->s_cluster_ratio = clustersize / blocksize;
  
 -      if (sbi->s_inodes_per_group > blocksize * 8) {
 -              ext4_msg(sb, KERN_ERR,
 -                     "#inodes per group too big: %lu",
 -                     sbi->s_inodes_per_group);
 -              goto failed_mount;
 -      }
 -
        /* Do we have standard group size of clustersize * 8 blocks ? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
 +      if (ext4_has_feature_meta_bg(sb)) {
 +              if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
 +                      ext4_msg(sb, KERN_WARNING,
 +                               "first meta block group too large: %u "
 +                               "(group descriptor block count %u)",
 +                               le32_to_cpu(es->s_first_meta_bg), db_count);
 +                      goto failed_mount;
 +              }
 +      }
        sbi->s_group_desc = ext4_kvmalloc(db_count *
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL);
        default:
                break;
        }
 +
 +      if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
 +          test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +              ext4_msg(sb, KERN_ERR, "can't mount with "
 +                      "journal_async_commit in data=ordered mode");
 +              goto failed_mount_wq;
 +      }
 +
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
  
        sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@@ -4204,9 -4160,7 +4204,9 @@@ no_journal
  
        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
 -                       "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
 +                       "Opts: %.*s%s%s", descr,
 +                       (int) sizeof(sbi->s_es->s_mount_opts),
 +                       sbi->s_es->s_mount_opts,
                         *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
  
        if (es->s_error_count)
@@@ -4285,8 -4239,8 +4285,8 @@@ failed_mount
  out_fail:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
 +out_free_base:
        kfree(sbi);
 -out_free_orig:
        kfree(orig_data);
        return err ? err : ret;
  }
@@@ -4596,8 -4550,7 +4596,8 @@@ static int ext4_commit_super(struct sup
                                &EXT4_SB(sb)->s_freeinodes_counter));
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
 -      lock_buffer(sbh);
 +      if (sync)
 +              lock_buffer(sbh);
        if (buffer_write_io_error(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
                set_buffer_uptodate(sbh);
        }
        mark_buffer_dirty(sbh);
 -      unlock_buffer(sbh);
        if (sync) {
 +              unlock_buffer(sbh);
                error = __sync_dirty_buffer(sbh,
 -                      test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
 +                      test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
                if (error)
                        return error;
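
The buffer lock in ext4_commit_super() is now taken only on the sync path, where it serializes the write-error handling and re-dirtying against a racing write completion before __sync_dirty_buffer(); the async path just marks the buffer dirty without locking, presumably so callers that must not sleep are no longer forced through lock_buffer(). WRITE_FUA/WRITE_SYNC becoming REQ_FUA/REQ_SYNC is the concurrent block-layer flag rename, with no behaviour change:

	if (sync)
		lock_buffer(sbh);
	/* ... clear a previous write error, mark_buffer_dirty(sbh) ... */
	if (sync) {
		unlock_buffer(sbh);
		error = __sync_dirty_buffer(sbh,
			test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
	}
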
  
@@@ -4904,13 -4857,6 +4904,13 @@@ static int ext4_remount(struct super_bl
                        err = -EINVAL;
                        goto restore_opts;
                }
 +      } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
 +              if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                              "journal_async_commit in data=ordered mode");
 +                      err = -EINVAL;
 +                      goto restore_opts;
 +              }
        }
  
        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
@@@ -5293,7 -5239,7 +5293,7 @@@ static void lockdep_set_quota_inode(str
   * Standard function to be called on quota_on
   */
  static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                        struct path *path)
+                        const struct path *path)
  {
        int err;
  
@@@ -5420,7 -5366,7 +5420,7 @@@ static int ext4_quota_off(struct super_
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle))
                goto out;
 -      inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 +      inode->i_mtime = inode->i_ctime = current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
  
diff --combined fs/internal.h
@@@ -62,7 -62,7 +62,7 @@@ extern int vfs_path_lookup(struct dentr
  extern void *copy_mount_options(const void __user *);
  extern char *copy_mount_string(const void __user *);
  
- extern struct vfsmount *lookup_mnt(struct path *);
+ extern struct vfsmount *lookup_mnt(const struct path *);
  extern int finish_automount(struct vfsmount *, struct path *);
  
  extern int sb_prepare_remount_readonly(struct super_block *);
@@@ -184,6 -184,3 +184,6 @@@ typedef loff_t (*iomap_actor_t)(struct 
  loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap_ops *ops, void *data,
                iomap_actor_t actor);
 +
 +/* direct-io.c: */
 +int sb_init_dio_done_wq(struct super_block *sb);
diff --combined fs/namei.c
@@@ -1200,7 -1200,7 +1200,7 @@@ static int follow_managed(struct path *
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(path->dentry, false);
+                       ret = path->dentry->d_op->d_manage(path, false);
                        if (ret < 0)
                                break;
                }
@@@ -1263,10 -1263,10 +1263,10 @@@ int follow_down_one(struct path *path
  }
  EXPORT_SYMBOL(follow_down_one);
  
- static inline int managed_dentry_rcu(struct dentry *dentry)
+ static inline int managed_dentry_rcu(const struct path *path)
  {
-       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
-               dentry->d_op->d_manage(dentry, true) : 0;
+       return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+               path->dentry->d_op->d_manage(path, true) : 0;
  }
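
->d_manage() now receives the whole path, const-qualified, instead of a bare dentry, so a managed-dentry handler can see which vfsmount it is being traversed through rather than just which dentry. For a filesystem providing the hook, the signature change looks like this (example_d_manage is an illustrative name, not a real handler):

	static int example_d_manage(const struct path *path, bool rcu_walk)
	{
		struct dentry *dentry = path->dentry;	/* the old sole argument */

		/* path->mnt is now visible too */
		return 0;	/* 0 = allow transit; -EISDIR/-ECHILD etc. as before */
	}
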
  
  /*
@@@ -1282,7 -1282,7 +1282,7 @@@ static bool __follow_mount_rcu(struct n
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
-               switch (managed_dentry_rcu(path->dentry)) {
+               switch (managed_dentry_rcu(path)) {
                case -ECHILD:
                default:
                        return false;
@@@ -1392,8 -1392,7 +1392,7 @@@ int follow_down(struct path *path
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(
-                               path->dentry, false);
+                       ret = path->dentry->d_op->d_manage(path, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }
@@@ -1725,35 -1724,30 +1724,35 @@@ static int pick_link(struct nameidata *
        return 1;
  }
  
 +enum {WALK_FOLLOW = 1, WALK_MORE = 2};
 +
  /*
   * Do we need to follow links? We _really_ want to be able
   * to do this check without having to look at inode->i_op,
   * so we keep a cache of "no, this doesn't need follow_link"
   * for the common case.
   */
 -static inline int should_follow_link(struct nameidata *nd, struct path *link,
 -                                   int follow,
 -                                   struct inode *inode, unsigned seq)
 +static inline int step_into(struct nameidata *nd, struct path *path,
 +                          int flags, struct inode *inode, unsigned seq)
  {
 -      if (likely(!d_is_symlink(link->dentry)))
 -              return 0;
 -      if (!follow)
 +      if (!(flags & WALK_MORE) && nd->depth)
 +              put_link(nd);
 +      if (likely(!d_is_symlink(path->dentry)) ||
 +         !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
 +              /* not a symlink or should not follow */
 +              path_to_nameidata(path, nd);
 +              nd->inode = inode;
 +              nd->seq = seq;
                return 0;
 +      }
        /* make sure that d_is_symlink above matches inode */
        if (nd->flags & LOOKUP_RCU) {
 -              if (read_seqcount_retry(&link->dentry->d_seq, seq))
 +              if (read_seqcount_retry(&path->dentry->d_seq, seq))
                        return -ECHILD;
        }
 -      return pick_link(nd, link, inode, seq);
 +      return pick_link(nd, path, inode, seq);
  }
  
 -enum {WALK_GET = 1, WALK_PUT = 2};
 -
  static int walk_component(struct nameidata *nd, int flags)
  {
        struct path path;
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                err = handle_dots(nd, nd->last_type);
 -              if (flags & WALK_PUT)
 +              if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return err;
        }
                inode = d_backing_inode(path.dentry);
        }
  
 -      if (flags & WALK_PUT)
 -              put_link(nd);
 -      err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
 -      if (unlikely(err))
 -              return err;
 -      path_to_nameidata(&path, nd);
 -      nd->inode = inode;
 -      nd->seq = seq;
 -      return 0;
 +      return step_into(nd, &path, flags, inode, seq);
  }
  
  /*
                        if (!name)
                                return 0;
                        /* last component of nested symlink */
 -                      err = walk_component(nd, WALK_GET | WALK_PUT);
 +                      err = walk_component(nd, WALK_FOLLOW);
                } else {
 -                      err = walk_component(nd, WALK_GET);
 +                      /* not the last component */
 +                      err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
                }
                if (err < 0)
                        return err;
@@@ -2246,7 -2247,12 +2245,7 @@@ static inline int lookup_last(struct na
                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
  
        nd->flags &= ~LOOKUP_PARENT;
 -      return walk_component(nd,
 -                      nd->flags & LOOKUP_FOLLOW
 -                              ? nd->depth
 -                                      ? WALK_PUT | WALK_GET
 -                                      : WALK_GET
 -                              : 0);
 +      return walk_component(nd, 0);
  }
  
  /* Returns 0 and nd will be valid on success; returns error otherwise. */
@@@ -2551,9 -2557,28 +2550,9 @@@ int user_path_at_empty(int dfd, const c
  }
  EXPORT_SYMBOL(user_path_at_empty);
  
 -/*
 - * NB: most callers don't do anything directly with the reference to the
 - *     to struct filename, but the nd->last pointer points into the name string
 - *     allocated by getname. So we must hold the reference to it until all
 - *     path-walking is complete.
 - */
 -static inline struct filename *
 -user_path_parent(int dfd, const char __user *path,
 -               struct path *parent,
 -               struct qstr *last,
 -               int *type,
 -               unsigned int flags)
 -{
 -      /* only LOOKUP_REVAL is allowed in extra flags */
 -      return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
 -                               parent, last, type);
 -}
 -
  /**
   * mountpoint_last - look up last component for umount
   * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 - * @path: pointer to container for result
   *
   * This is a special lookup_last function just for umount. In this case, we
   * need to resolve the path without doing any revalidation.
   *
   * Returns:
   * -error: if there was an error during lookup. This includes -ENOENT if the
 - *         lookup found a negative dentry. The nd->path reference will also be
 - *         put in this case.
 + *         lookup found a negative dentry.
   *
 - * 0:      if we successfully resolved nd->path and found it to not to be a
 - *         symlink that needs to be followed. "path" will also be populated.
 - *         The nd->path reference will also be put.
 + * 0:      if we successfully resolved nd->last and found it not to be a
 + *         symlink that needs to be followed.
   *
   * 1:      if we successfully resolved nd->last and found it to be a symlink
 - *         that needs to be followed. "path" will be populated with the path
 - *         to the link, and nd->path will *not* be put.
 + *         that needs to be followed.
   */
  static int
 -mountpoint_last(struct nameidata *nd, struct path *path)
 +mountpoint_last(struct nameidata *nd)
  {
        int error = 0;
 -      struct dentry *dentry;
        struct dentry *dir = nd->path.dentry;
 +      struct path path;
  
        /* If we're in rcuwalk, drop out of it to handle last component */
        if (nd->flags & LOOKUP_RCU) {
                error = handle_dots(nd, nd->last_type);
                if (error)
                        return error;
 -              dentry = dget(nd->path.dentry);
 +              path.dentry = dget(nd->path.dentry);
        } else {
 -              dentry = d_lookup(dir, &nd->last);
 -              if (!dentry) {
 +              path.dentry = d_lookup(dir, &nd->last);
 +              if (!path.dentry) {
                        /*
                         * No cached dentry. Mounted dentries are pinned in the
                         * cache, so that means that this dentry is probably
                         * a symlink or the path doesn't actually point
                         * to a mounted dentry.
                         */
 -                      dentry = lookup_slow(&nd->last, dir,
 +                      path.dentry = lookup_slow(&nd->last, dir,
                                             nd->flags | LOOKUP_NO_REVAL);
 -                      if (IS_ERR(dentry))
 -                              return PTR_ERR(dentry);
 +                      if (IS_ERR(path.dentry))
 +                              return PTR_ERR(path.dentry);
                }
        }
 -      if (d_is_negative(dentry)) {
 -              dput(dentry);
 +      if (d_is_negative(path.dentry)) {
 +              dput(path.dentry);
                return -ENOENT;
        }
 -      if (nd->depth)
 -              put_link(nd);
 -      path->dentry = dentry;
 -      path->mnt = nd->path.mnt;
 -      error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
 -                                 d_backing_inode(dentry), 0);
 -      if (unlikely(error))
 -              return error;
 -      mntget(path->mnt);
 -      follow_mount(path);
 -      return 0;
 +      path.mnt = nd->path.mnt;
 +      return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
  }
  
  /**
@@@ -2634,19 -2671,13 +2633,19 @@@ path_mountpoint(struct nameidata *nd, u
        if (IS_ERR(s))
                return PTR_ERR(s);
        while (!(err = link_path_walk(s, nd)) &&
 -              (err = mountpoint_last(nd, path)) > 0) {
 +              (err = mountpoint_last(nd)) > 0) {
                s = trailing_symlink(nd);
                if (IS_ERR(s)) {
                        err = PTR_ERR(s);
                        break;
                }
        }
 +      if (!err) {
 +              *path = nd->path;
 +              nd->path.mnt = NULL;
 +              nd->path.dentry = NULL;
 +              follow_mount(path);
 +      }
        terminate_walk(nd);
        return err;
  }
@@@ -2863,7 -2894,7 +2862,7 @@@ bool may_open_dev(const struct path *pa
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
  }
  
- static int may_open(struct path *path, int acc_mode, int flag)
+ static int may_open(const struct path *path, int acc_mode, int flag)
  {
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
  
  static int handle_truncate(struct file *filp)
  {
-       struct path *path = &filp->f_path;
+       const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
@@@ -3303,11 -3334,18 +3302,11 @@@ static int do_last(struct nameidata *nd
        seq = 0;        /* out of RCU mode, so the value doesn't matter */
        inode = d_backing_inode(path.dentry);
  finish_lookup:
 -      if (nd->depth)
 -              put_link(nd);
 -      error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
 -                                 inode, seq);
 +      error = step_into(nd, &path, 0, inode, seq);
        if (unlikely(error))
                return error;
 -
 -      path_to_nameidata(&path, nd);
 -      nd->inode = inode;
 -      nd->seq = seq;
 -      /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
  finish_open:
 +      /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
        error = complete_walk(nd);
        if (error)
                return error;
@@@ -3822,8 -3860,8 +3821,8 @@@ static long do_rmdir(int dfd, const cha
        int type;
        unsigned int lookup_flags = 0;
  retry:
 -      name = user_path_parent(dfd, pathname,
 -                              &path, &last, &type, lookup_flags);
 +      name = filename_parentat(dfd, getname(pathname), lookup_flags,
 +                              &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);
  
@@@ -3952,8 -3990,8 +3951,8 @@@ static long do_unlinkat(int dfd, const 
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
  retry:
 -      name = user_path_parent(dfd, pathname,
 -                              &path, &last, &type, lookup_flags);
 +      name = filename_parentat(dfd, getname(pathname), lookup_flags,
 +                              &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);
  
@@@ -4306,7 -4344,11 +4305,7 @@@ int vfs_rename(struct inode *old_dir, s
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
  
 -      /*
 -       * Check source == target.
 -       * On overlayfs need to look at underlying inodes.
 -       */
 -      if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
 +      if (source == target)
                return 0;
  
        error = may_delete(old_dir, old_dentry, is_dir);
@@@ -4448,15 -4490,15 +4447,15 @@@ SYSCALL_DEFINE5(renameat2, int, olddfd
                target_flags = 0;
  
  retry:
 -      from = user_path_parent(olddfd, oldname,
 -                              &old_path, &old_last, &old_type, lookup_flags);
 +      from = filename_parentat(olddfd, getname(oldname), lookup_flags,
 +                              &old_path, &old_last, &old_type);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto exit;
        }
  
 -      to = user_path_parent(newdfd, newname,
 -                              &new_path, &new_last, &new_type, lookup_flags);
 +      to = filename_parentat(newdfd, getname(newname), lookup_flags,
 +                              &new_path, &new_last, &new_type);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
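One user-visible property preserved by the vfs_rename() hunk above (the
overlayfs-aware d_real_inode() comparison is replaced by a plain
"source == target" check): renaming a file onto one of its own hard
links is still a successful no-op, as POSIX requires.  A minimal
user-space check, using hypothetical file names "a" and "b":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        if (close(creat("a", 0644)) || link("a", "b"))
                return 1;
        /* "a" and "b" are the same inode: rename() must succeed and
         * leave both names in place. */
        if (rename("a", "b"))
                perror("rename");
        return access("a", F_OK) || access("b", F_OK);
}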
diff --combined fs/nfs/file.c
@@@ -102,11 -102,8 +102,11 @@@ static int nfs_revalidate_file_size(str
  {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
 +      const unsigned long force_reval = NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED;
 +      unsigned long cache_validity = nfsi->cache_validity;
  
 -      if (nfs_have_delegated_attributes(inode))
 +      if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
 +          (cache_validity & force_reval) != force_reval)
                goto out_noreval;
  
        if (filp->f_flags & O_DIRECT)
@@@ -377,7 -374,7 +377,7 @@@ static int nfs_write_end(struct file *f
         */
        if (!PageUptodate(page)) {
                unsigned pglen = nfs_page_length(page);
-               unsigned end = offset + len;
+               unsigned end = offset + copied;
  
                if (pglen == 0) {
                        zero_user_segments(page, 0, offset,
diff --combined fs/ocfs2/aops.c
@@@ -464,6 -464,15 +464,15 @@@ static sector_t ocfs2_bmap(struct addre
        trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
                         (unsigned long long)block);
  
+       /*
+        * The swap code (ab-)uses ->bmap to get a block mapping and then
+        * bypasses the file system for actual I/O.  We really can't allow
+        * that on refcounted inodes, so we have to skip out here.  And yes,
+        * 0 is the magic code for a bmap error.
+        */
+       if (ocfs2_is_refcount_inode(inode))
+               return 0;
        /* We don't need to lock journal system files, since they aren't
         * accessed concurrently from multiple nodes.
         */
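The ocfs2_is_refcount_inode() helper used here (and throughout the
refcounttree.c changes below) comes from the "convert inode refcount
test to a helper" commit in this pull; its definition, in
fs/ocfs2/inode.h, is simply:

static inline int ocfs2_is_refcount_inode(struct inode *inode)
{
        return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
}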
@@@ -630,7 -639,7 +639,7 @@@ int ocfs2_map_page_blocks(struct page *
  
                if (!buffer_mapped(bh)) {
                        map_bh(bh, inode->i_sb, *p_blkno);
 -                      unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 +                      clean_bdev_bh_alias(bh);
                }
  
                if (PageUptodate(page)) {
@@@ -1950,7 -1959,8 +1959,7 @@@ static void ocfs2_write_end_inline(stru
  }
  
  int ocfs2_write_end_nolock(struct address_space *mapping,
 -                         loff_t pos, unsigned len, unsigned copied,
 -                         struct page *page, void *fsdata)
 +                         loff_t pos, unsigned len, unsigned copied, void *fsdata)
  {
        int i, ret;
        unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@@ -2063,7 -2073,7 +2072,7 @@@ static int ocfs2_write_end(struct file 
        int ret;
        struct inode *inode = mapping->host;
  
 -      ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
 +      ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
  
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        ocfs2_inode_unlock(inode, 1);
@@@ -2240,7 -2250,7 +2249,7 @@@ static int ocfs2_dio_get_block(struct i
                dwc->dw_zero_count++;
        }
  
 -      ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
 +      ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
        BUG_ON(ret != len);
        ret = 0;
  unlock:
        return ret;
  }
  
- static void ocfs2_dio_end_io_write(struct inode *inode,
-                                  struct ocfs2_dio_write_ctxt *dwc,
-                                  loff_t offset,
-                                  ssize_t bytes)
+ static int ocfs2_dio_end_io_write(struct inode *inode,
+                                 struct ocfs2_dio_write_ctxt *dwc,
+                                 loff_t offset,
+                                 ssize_t bytes)
  {
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_extent_tree et;
                        mlog_errno(ret);
        }
  
-       di = (struct ocfs2_dinode *)di_bh;
+       di = (struct ocfs2_dinode *)di_bh->b_data;
  
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
  
@@@ -2364,6 -2374,8 +2373,8 @@@ out
        if (locked)
                inode_unlock(inode);
        ocfs2_dio_free_write_ctx(inode, dwc);
+       return ret;
  }
  
  /*
@@@ -2378,21 -2390,19 +2389,19 @@@ static int ocfs2_dio_end_io(struct kioc
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        int level;
-       if (bytes <= 0)
-               return 0;
+       int ret = 0;
  
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
  
-       if (private)
-               ocfs2_dio_end_io_write(inode, private, offset, bytes);
+       if (bytes > 0 && private)
+               ret = ocfs2_dio_end_io_write(inode, private, offset, bytes);
  
        ocfs2_iocb_clear_rw_locked(iocb);
  
        level = ocfs2_iocb_rw_locked_level(iocb);
        ocfs2_rw_unlock(inode, level);
-       return 0;
+       return ret;
  }
  
  static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
diff --combined fs/ocfs2/refcounttree.c
@@@ -34,6 -34,7 +34,7 @@@
  #include "xattr.h"
  #include "namei.h"
  #include "ocfs2_trace.h"
+ #include "file.h"
  
  #include <linux/bio.h>
  #include <linux/blkdev.h>
@@@ -410,7 -411,7 +411,7 @@@ static int ocfs2_get_refcount_block(str
                goto out;
        }
  
-       BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        di = (struct ocfs2_dinode *)di_bh->b_data;
        *ref_blkno = le64_to_cpu(di->i_refcount_loc);
@@@ -478,6 -479,7 +479,6 @@@ again
        if (ret) {
                mlog_errno(ret);
                ocfs2_unlock_refcount_tree(osb, tree, rw);
 -              ocfs2_refcount_tree_put(tree);
                goto out;
        }
  
@@@ -569,7 -571,7 +570,7 @@@ static int ocfs2_create_refcount_tree(s
        u32 num_got;
        u64 suballoc_loc, first_blkno;
  
-       BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+       BUG_ON(ocfs2_is_refcount_inode(inode));
  
        trace_ocfs2_create_refcount_tree(
                (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@@ -707,7 -709,7 +708,7 @@@ static int ocfs2_set_refcount_tree(stru
        struct ocfs2_refcount_block *rb;
        struct ocfs2_refcount_tree *ref_tree;
  
-       BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+       BUG_ON(ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
                                       &ref_tree, &ref_root_bh);
@@@ -774,7 -776,7 +775,7 @@@ int ocfs2_remove_refcount_tree(struct i
        u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
        u16 bit = 0;
  
-       if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+       if (!ocfs2_is_refcount_inode(inode))
                return 0;
  
        BUG_ON(!ref_blkno);
@@@ -2298,11 -2300,10 +2299,10 @@@ int ocfs2_decrease_refcount(struct inod
  {
        int ret;
        u64 ref_blkno;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *tree;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_get_refcount_block(inode, &ref_blkno);
        if (ret) {
@@@ -2532,7 -2533,6 +2532,6 @@@ int ocfs2_prepare_refcount_change_for_d
                                          int *ref_blocks)
  {
        int ret;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *tree;
        u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
                goto out;
        }
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
                                      refcount_loc, &tree);
@@@ -3411,14 -3411,13 +3410,13 @@@ static int ocfs2_refcount_cow_hunk(stru
  {
        int ret;
        u32 cow_start = 0, cow_len = 0;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *ref_tree;
        struct ocfs2_cow_context *context = NULL;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
                                              cpos, write_len, max_cpos,
@@@ -3628,11 -3627,10 +3626,10 @@@ int ocfs2_refcount_cow_xattr(struct ino
  {
        int ret;
        struct ocfs2_xattr_value_root *xv = vb->vb_xv;
-       struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_cow_context *context = NULL;
        u32 cow_start, cow_len;
  
-       BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+       BUG_ON(!ocfs2_is_refcount_inode(inode));
  
        ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
                                              cpos, write_len, UINT_MAX,
@@@ -3695,6 -3693,9 +3692,9 @@@ int ocfs2_add_refcount_flag(struct inod
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_alloc_context *meta_ac = NULL;
  
+       /* We need to be able to handle at least an extent tree split. */
+       ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
        ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
                                               ref_ci, ref_root_bh,
                                               p_cluster, num_clusters,
@@@ -3806,7 -3807,7 +3806,7 @@@ static int ocfs2_attach_refcount_tree(s
  
        ocfs2_init_dealloc_ctxt(&dealloc);
  
-       if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+       if (!ocfs2_is_refcount_inode(inode)) {
                ret = ocfs2_create_refcount_tree(inode, di_bh);
                if (ret) {
                        mlog_errno(ret);
@@@ -3933,6 -3934,13 +3933,13 @@@ static int ocfs2_add_refcounted_extent(
        ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
                                      p_cluster, num_clusters,
                                      meta_ac, dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+       ret = dquot_alloc_space_nodirty(inode,
+               ocfs2_clusters_to_bytes(osb->sb, num_clusters));
        if (ret)
                mlog_errno(ret);
  
  
        return error;
  }
+
+ /* Update destination inode size, if necessary. */
+ static int ocfs2_reflink_update_dest(struct inode *dest,
+                                    struct buffer_head *d_bh,
+                                    loff_t newlen)
+ {
+       handle_t *handle;
+       int ret;
+       dest->i_blocks = ocfs2_inode_sector_count(dest);
+       if (newlen <= i_size_read(dest))
+               return 0;
+       handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+                                  OCFS2_INODE_UPDATE_CREDITS);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               return ret;
+       }
+       /* Extend i_size if needed. */
+       spin_lock(&OCFS2_I(dest)->ip_lock);
+       if (newlen > i_size_read(dest))
+               i_size_write(dest, newlen);
+       spin_unlock(&OCFS2_I(dest)->ip_lock);
+       dest->i_ctime = dest->i_mtime = current_time(dest);
+       ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+ out_commit:
+       ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+       return ret;
+ }
+
+ /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+                                     struct buffer_head *s_bh,
+                                     loff_t pos_in,
+                                     struct inode *t_inode,
+                                     struct buffer_head *t_bh,
+                                     loff_t pos_out,
+                                     loff_t len,
+                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
+ {
+       struct ocfs2_extent_tree s_et;
+       struct ocfs2_extent_tree t_et;
+       struct ocfs2_dinode *dis;
+       struct buffer_head *ref_root_bh = NULL;
+       struct ocfs2_refcount_tree *ref_tree;
+       struct ocfs2_super *osb;
+       loff_t pstart, plen;
+       u32 p_cluster, num_clusters, slast, spos, tpos;
+       unsigned int ext_flags;
+       int ret = 0;
+       osb = OCFS2_SB(s_inode->i_sb);
+       dis = (struct ocfs2_dinode *)s_bh->b_data;
+       ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+       ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+       spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+       tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+       slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+       while (spos < slast) {
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       goto out;
+               }
+               /* Look up the extent. */
+               ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+                                        &num_clusters, &ext_flags);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               num_clusters = min_t(u32, num_clusters, slast - spos);
+               /* Punch out the dest range. */
+               pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+               plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+               ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               if (p_cluster == 0)
+                       goto next_loop;
+               /* Lock the refcount btree... */
+               ret = ocfs2_lock_refcount_tree(osb,
+                                              le64_to_cpu(dis->i_refcount_loc),
+                                              1, &ref_tree, &ref_root_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               /* Mark s_inode's extent as refcounted. */
+               if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+                       ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+                                                     &ref_tree->rf_ci,
+                                                     ref_root_bh, spos,
+                                                     p_cluster, num_clusters,
+                                                     dealloc, NULL);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out_unlock_refcount;
+                       }
+               }
+               /* Map in the new extent. */
+               ext_flags |= OCFS2_EXT_REFCOUNTED;
+               ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+                                                 &ref_tree->rf_ci,
+                                                 ref_root_bh,
+                                                 tpos, p_cluster,
+                                                 num_clusters,
+                                                 ext_flags,
+                                                 dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_unlock_refcount;
+               }
+               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+               brelse(ref_root_bh);
+ next_loop:
+               spos += num_clusters;
+               tpos += num_clusters;
+       }
+ out:
+       return ret;
+ out_unlock_refcount:
+       ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+       brelse(ref_root_bh);
+       return ret;
+ }
+
+ /* Set up refcount tree and remap s_inode to t_inode. */
+ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+                                     struct buffer_head *s_bh,
+                                     loff_t pos_in,
+                                     struct inode *t_inode,
+                                     struct buffer_head *t_bh,
+                                     loff_t pos_out,
+                                     loff_t len)
+ {
+       struct ocfs2_cached_dealloc_ctxt dealloc;
+       struct ocfs2_super *osb;
+       struct ocfs2_dinode *dis;
+       struct ocfs2_dinode *dit;
+       int ret;
+       osb = OCFS2_SB(s_inode->i_sb);
+       dis = (struct ocfs2_dinode *)s_bh->b_data;
+       dit = (struct ocfs2_dinode *)t_bh->b_data;
+       ocfs2_init_dealloc_ctxt(&dealloc);
+       /*
+        * If we're reflinking the entire file and the source is inline
+        * data, just copy the contents.
+        */
+       if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+           i_size_read(t_inode) <= len &&
+           (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+               ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+               if (ret)
+                       mlog_errno(ret);
+               goto out;
+       }
+       /*
+        * If both inodes belong to two different refcount groups then
+        * forget it because we don't know how (or want) to go merging
+        * refcount trees.
+        */
+       ret = -EOPNOTSUPP;
+       if (ocfs2_is_refcount_inode(s_inode) &&
+           ocfs2_is_refcount_inode(t_inode) &&
+           le64_to_cpu(dis->i_refcount_loc) !=
+           le64_to_cpu(dit->i_refcount_loc))
+               goto out;
+       /* Neither inode has a refcount tree.  Add one to s_inode. */
+       if (!ocfs2_is_refcount_inode(s_inode) &&
+           !ocfs2_is_refcount_inode(t_inode)) {
+               ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Ensure that both inodes end up with the same refcount tree. */
+       if (!ocfs2_is_refcount_inode(s_inode)) {
+               ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+                                             le64_to_cpu(dit->i_refcount_loc));
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       if (!ocfs2_is_refcount_inode(t_inode)) {
+               ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+                                             le64_to_cpu(dis->i_refcount_loc));
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Turn off inline data in the dest file. */
+       if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+       /* Actually remap extents now. */
+       ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+                                        pos_out, len, &dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+ out:
+       if (ocfs2_dealloc_has_cluster(&dealloc)) {
+               ocfs2_schedule_truncate_log_flush(osb, 1);
+               ocfs2_run_deallocs(osb, &dealloc);
+       }
+       return ret;
+ }
+
+ /* Lock an inode and grab a bh pointing to the inode. */
+ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+                                    struct buffer_head **bh1,
+                                    struct inode *t_inode,
+                                    struct buffer_head **bh2)
+ {
+       struct inode *inode1;
+       struct inode *inode2;
+       struct ocfs2_inode_info *oi1;
+       struct ocfs2_inode_info *oi2;
+       bool same_inode = (s_inode == t_inode);
+       int status;
+       /* First grab the VFS and rw locks. */
+       lock_two_nondirectories(s_inode, t_inode);
+       inode1 = s_inode;
+       inode2 = t_inode;
+       if (inode1->i_ino > inode2->i_ino)
+               swap(inode1, inode2);
+       status = ocfs2_rw_lock(inode1, 1);
+       if (status) {
+               mlog_errno(status);
+               goto out_i1;
+       }
+       if (!same_inode) {
+               status = ocfs2_rw_lock(inode2, 1);
+               if (status) {
+                       mlog_errno(status);
+                       goto out_i2;
+               }
+       }
+       /* Now go for the cluster locks */
+       oi1 = OCFS2_I(inode1);
+       oi2 = OCFS2_I(inode2);
+       trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+                               (unsigned long long)oi2->ip_blkno);
+       if (*bh1)
+               *bh1 = NULL;
+       if (*bh2)
+               *bh2 = NULL;
+       /* We always want to lock the one with the lower lockid first. */
+       if (oi1->ip_blkno > oi2->ip_blkno)
+               mlog_errno(-ENOLCK);
+       /* lock id1 */
+       status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+       if (status < 0) {
+               if (status != -ENOENT)
+                       mlog_errno(status);
+               goto out_rw2;
+       }
+       /* lock id2 */
+       if (!same_inode) {
+               status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+                                                OI_LS_REFLINK_TARGET);
+               if (status < 0) {
+                       if (status != -ENOENT)
+                               mlog_errno(status);
+                       goto out_cl1;
+               }
+       } else
+               *bh2 = *bh1;
+       trace_ocfs2_double_lock_end(
+                       (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+                       (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+       return 0;
+ out_cl1:
+       ocfs2_inode_unlock(inode1, 1);
+       brelse(*bh1);
+       *bh1 = NULL;
+ out_rw2:
+       ocfs2_rw_unlock(inode2, 1);
+ out_i2:
+       ocfs2_rw_unlock(inode1, 1);
+ out_i1:
+       unlock_two_nondirectories(s_inode, t_inode);
+       return status;
+ }
+
+ /* Unlock both inodes and release buffers. */
+ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+                                       struct buffer_head *s_bh,
+                                       struct inode *t_inode,
+                                       struct buffer_head *t_bh)
+ {
+       ocfs2_inode_unlock(s_inode, 1);
+       ocfs2_rw_unlock(s_inode, 1);
+       brelse(s_bh);
+       if (s_inode != t_inode) {
+               ocfs2_inode_unlock(t_inode, 1);
+               ocfs2_rw_unlock(t_inode, 1);
+               brelse(t_bh);
+       }
+       unlock_two_nondirectories(s_inode, t_inode);
+ }
+
+ /* Link a range of blocks from one file to another. */
+ int ocfs2_reflink_remap_range(struct file *file_in,
+                             loff_t pos_in,
+                             struct file *file_out,
+                             loff_t pos_out,
+                             u64 len,
+                             bool is_dedupe)
+ {
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
+       struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+       struct buffer_head *in_bh = NULL, *out_bh = NULL;
+       bool same_inode = (inode_in == inode_out);
+       ssize_t ret;
+       if (!ocfs2_refcount_tree(osb))
+               return -EOPNOTSUPP;
+       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+               return -EROFS;
+       /* Lock both files against IO */
+       ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+       if (ret)
+               return ret;
+       /* Check file eligibility and prepare for block sharing. */
+       ret = -EINVAL;
+       if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+           (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+               goto out_unlock;
+       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+                       &len, is_dedupe);
+       if (ret || len == 0)
+               goto out_unlock;
+       /* Lock out changes to the allocation maps and remap. */
+       down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+       if (!same_inode)
+               down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+                                 SINGLE_DEPTH_NESTING);
+       ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+                                        out_bh, pos_out, len);
+       /* Zap any page cache for the destination file's range. */
+       if (!ret)
+               truncate_inode_pages_range(&inode_out->i_data, pos_out,
+                                          PAGE_ALIGN(pos_out + len) - 1);
+       up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+       if (!same_inode)
+               up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+       /*
+        * Empty the extent map so that we may get the right extent
+        * record from the disk.
+        */
+       ocfs2_extent_map_trunc(inode_in, 0);
+       ocfs2_extent_map_trunc(inode_out, 0);
+       ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+       return 0;
+ out_unlock:
+       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+       return ret;
+ }
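With ocfs2_reflink_remap_range() wired up to the VFS clone and dedupe
methods, the FICLONE/FICLONERANGE ioctls now work on refcount-enabled
ocfs2 volumes.  A minimal user-space sketch of a whole-file clone via
FICLONERANGE (a src_length of 0 means "to EOF"; sub-ranges must be
block-aligned, per the VFS prep checks in fs/read_write.c below):

#include <fcntl.h>
#include <linux/fs.h>           /* FICLONERANGE, struct file_clone_range */
#include <stdio.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
        struct file_clone_range fcr = { 0 };
        int src, dst;

        if (argc != 3)
                return 1;
        src = open(argv[1], O_RDONLY);
        dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
        if (src < 0 || dst < 0)
                return 1;
        fcr.src_fd = src;       /* offsets/length left at 0: clone to EOF */
        if (ioctl(dst, FICLONERANGE, &fcr) < 0)
                perror("FICLONERANGE");
        return 0;
}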
diff --combined fs/read_write.c
@@@ -1538,26 -1538,45 +1538,43 @@@ ssize_t vfs_copy_file_range(struct fil
        if (len == 0)
                return 0;
  
 -      ret = mnt_want_write_file(file_out);
 -      if (ret)
 -              return ret;
 +      sb_start_write(inode_out->i_sb);
  
-       ret = -EOPNOTSUPP;
-       if (file_out->f_op->copy_file_range)
+       /*
+        * Try cloning first, this is supported by more file systems, and
+        * more efficient if both clone and copy are supported (e.g. NFS).
+        */
+       if (file_in->f_op->clone_file_range) {
+               ret = file_in->f_op->clone_file_range(file_in, pos_in,
+                               file_out, pos_out, len);
+               if (ret == 0) {
+                       ret = len;
+                       goto done;
+               }
+       }
+       if (file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
                                                      pos_out, len, flags);
-       if (ret == -EOPNOTSUPP)
-               ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+               if (ret != -EOPNOTSUPP)
+                       goto done;
+       }
+       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
  
+ done:
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }
        inc_syscr(current);
        inc_syscw(current);
  
 -      mnt_drop_write_file(file_out);
 +      sb_end_write(inode_out->i_sb);
  
        return ret;
  }
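With the clone-first ordering above, copy_file_range(2) transparently
reflinks when the underlying filesystem supports it and otherwise falls
back to ->copy_file_range and then to splice.  A user-space sketch using
the raw syscall (no glibc wrapper exists at this point;
__NR_copy_file_range requires 4.5+ kernel headers):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        loff_t off_in = 0, off_out = 0;
        struct stat st;
        ssize_t n;
        int in, out;

        if (argc != 3)
                return 1;
        in = open(argv[1], O_RDONLY);
        if (in < 0 || fstat(in, &st))
                return 1;
        out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (out < 0)
                return 1;
        while (off_in < st.st_size) {
                n = syscall(__NR_copy_file_range, in, &off_in, out,
                            &off_out, st.st_size - off_in, 0);
                if (n <= 0) {
                        perror("copy_file_range");
                        return 1;
                }
        }
        return 0;
}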
@@@ -1648,6 -1667,114 +1665,114 @@@ static int clone_verify_area(struct fil
        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
  }
  
+ /*
+  * Check that the two inodes are eligible for cloning, the ranges make
+  * sense, and then flush all dirty data.  Caller must ensure that the
+  * inodes have been locked against any other modifications.
+  */
+ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+                              struct inode *inode_out, loff_t pos_out,
+                              u64 *len, bool is_dedupe)
+ {
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       loff_t blen;
+       loff_t isize;
+       bool same_inode = (inode_in == inode_out);
+       int ret;
+       /* Don't touch certain kinds of inodes */
+       if (IS_IMMUTABLE(inode_out))
+               return -EPERM;
+       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+               return -ETXTBSY;
+       /* Don't reflink dirs, pipes, sockets... */
+       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+               return -EISDIR;
+       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+               return -EINVAL;
+       /* Are we going all the way to the end? */
+       isize = i_size_read(inode_in);
+       if (isize == 0) {
+               *len = 0;
+               return 0;
+       }
+       /* Zero length dedupe exits immediately; reflink goes to EOF. */
+       if (*len == 0) {
+               if (is_dedupe) {
+                       *len = 0;
+                       return 0;
+               }
+               *len = isize - pos_in;
+       }
+       /* Ensure offsets don't wrap and the input is inside i_size */
+       if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
+           pos_in + *len > isize)
+               return -EINVAL;
+       /* Don't allow dedupe past EOF in the dest file */
+       if (is_dedupe) {
+               loff_t  disize;
+               disize = i_size_read(inode_out);
+               if (pos_out >= disize || pos_out + *len > disize)
+                       return -EINVAL;
+       }
+       /* If we're linking to EOF, continue to the block boundary. */
+       if (pos_in + *len == isize)
+               blen = ALIGN(isize, bs) - pos_in;
+       else
+               blen = *len;
+       /* Only reflink if we're aligned to block boundaries */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+               return -EINVAL;
+       /* Don't allow overlapped reflink within the same file */
+       if (same_inode) {
+               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+                       return -EINVAL;
+       }
+       /* Wait for the completion of any pending IOs on both files */
+       inode_dio_wait(inode_in);
+       if (!same_inode)
+               inode_dio_wait(inode_out);
+       ret = filemap_write_and_wait_range(inode_in->i_mapping,
+                       pos_in, pos_in + *len - 1);
+       if (ret)
+               return ret;
+       ret = filemap_write_and_wait_range(inode_out->i_mapping,
+                       pos_out, pos_out + *len - 1);
+       if (ret)
+               return ret;
+       /*
+        * Check that the extents are the same.
+        */
+       if (is_dedupe) {
+               bool            is_same = false;
+               ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+                               inode_out, pos_out, *len, &is_same);
+               if (ret)
+                       return ret;
+               if (!is_same)
+                       return -EBADE;
+       }
+       return 0;
+ }
+ EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+
  int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, u64 len)
  {
        struct inode *inode_out = file_inode(file_out);
        int ret;
  
 -      if (inode_in->i_sb != inode_out->i_sb ||
 -          file_in->f_path.mnt != file_out->f_path.mnt)
 -              return -EXDEV;
 -
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;
  
 +      /*
 +       * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
 +       * the same mount. Practically, they only need to be on the same file
 +       * system.
 +       */
 +      if (inode_in->i_sb != inode_out->i_sb)
 +              return -EXDEV;
 +
        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
            (file_out->f_flags & O_APPEND))
        if (pos_in + len > i_size_read(inode_in))
                return -EINVAL;
  
 -      ret = mnt_want_write_file(file_out);
 -      if (ret)
 -              return ret;
 -
        ret = file_in->f_op->clone_file_range(file_in, pos_in,
                        file_out, pos_out, len);
        if (!ret) {
                fsnotify_modify(file_out);
        }
  
 -      mnt_drop_write_file(file_out);
        return ret;
  }
  EXPORT_SYMBOL(vfs_clone_file_range);
  
+ /*
+  * Read a page's worth of file data into the page cache.  Return the page
+  * locked.
+  */
+ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+ {
+       struct address_space *mapping;
+       struct page *page;
+       pgoff_t n;
+       n = offset >> PAGE_SHIFT;
+       mapping = inode->i_mapping;
+       page = read_mapping_page(mapping, n, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       lock_page(page);
+       return page;
+ }
+
+ /*
+  * Compare extents of two files to see if they are the same.
+  * Caller must have locked both inodes to prevent write races.
+  */
+ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                 struct inode *dest, loff_t destoff,
+                                 loff_t len, bool *is_same)
+ {
+       loff_t src_poff;
+       loff_t dest_poff;
+       void *src_addr;
+       void *dest_addr;
+       struct page *src_page;
+       struct page *dest_page;
+       loff_t cmp_len;
+       bool same;
+       int error;
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_SIZE - 1);
+               dest_poff = destoff & (PAGE_SIZE - 1);
+               cmp_len = min(PAGE_SIZE - src_poff,
+                             PAGE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               if (cmp_len <= 0)
+                       goto out_error;
+               src_page = vfs_dedupe_get_page(src, srcoff);
+               if (IS_ERR(src_page)) {
+                       error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = vfs_dedupe_get_page(dest, destoff);
+               if (IS_ERR(dest_page)) {
+                       error = PTR_ERR(dest_page);
+                       unlock_page(src_page);
+                       put_page(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               unlock_page(dest_page);
+               unlock_page(src_page);
+               put_page(dest_page);
+               put_page(src_page);
+               if (!same)
+                       break;
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+       *is_same = same;
+       return 0;
+ out_error:
+       return error;
+ }
+ EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
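vfs_dedupe_file_range_compare() is what guarantees FIDEDUPERANGE only
shares byte-identical ranges; a mismatch surfaces internally as -EBADE,
which the ioctl reports per destination as FILE_DEDUPE_RANGE_DIFFERS.
A minimal sketch of the user-space side (4.5+ uapi headers assumed;
argv[3] is the number of bytes to dedupe):

#include <fcntl.h>
#include <linux/fs.h>           /* FIDEDUPERANGE, struct file_dedupe_range */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
        struct file_dedupe_range *r;
        int src, dst;

        if (argc != 4)
                return 1;
        src = open(argv[1], O_RDONLY);
        dst = open(argv[2], O_WRONLY);
        if (src < 0 || dst < 0)
                return 1;
        r = calloc(1, sizeof(*r) + sizeof(struct file_dedupe_range_info));
        if (!r)
                return 1;
        r->src_length = strtoull(argv[3], NULL, 0);
        r->dest_count = 1;
        r->info[0].dest_fd = dst;
        if (ioctl(src, FIDEDUPERANGE, r) < 0) {
                perror("FIDEDUPERANGE");
                return 1;
        }
        if (r->info[0].status == FILE_DEDUPE_RANGE_SAME)
                printf("deduped %llu bytes\n",
                       (unsigned long long)r->info[0].bytes_deduped);
        else
                printf("not deduped, status %d\n", (int)r->info[0].status);
        return 0;
}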
  int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
  {
        struct file_dedupe_range_info *info;
diff --combined fs/xfs/xfs_file.c
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
 -/*
 - * Locking primitives for read and write IO paths to ensure we consistently use
 - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
 - */
 -static inline void
 -xfs_rw_ilock(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_lock(VFS_I(ip));
 -      xfs_ilock(ip, type);
 -}
 -
 -static inline void
 -xfs_rw_iunlock(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      xfs_iunlock(ip, type);
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_unlock(VFS_I(ip));
 -}
 -
 -static inline void
 -xfs_rw_ilock_demote(
 -      struct xfs_inode        *ip,
 -      int                     type)
 -{
 -      xfs_ilock_demote(ip, type);
 -      if (type & XFS_IOLOCK_EXCL)
 -              inode_unlock(VFS_I(ip));
 -}
 -
  /*
   * Clear the specified ranges to zero through either the pagecache or DAX.
   * Holes and unwritten extents will be left as-is as they already are zeroed.
@@@ -149,16 -183,19 +149,16 @@@ xfs_file_fsync
  
        xfs_iflags_clear(ip, XFS_ITRUNCATED);
  
 -      if (mp->m_flags & XFS_MOUNT_BARRIER) {
 -              /*
 -               * If we have an RT and/or log subvolume we need to make sure
 -               * to flush the write cache the device used for file data
 -               * first.  This is to ensure newly written file data make
 -               * it to disk before logging the new inode size in case of
 -               * an extending write.
 -               */
 -              if (XFS_IS_REALTIME_INODE(ip))
 -                      xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 -              else if (mp->m_logdev_targp != mp->m_ddev_targp)
 -                      xfs_blkdev_issue_flush(mp->m_ddev_targp);
 -      }
 +      /*
 +       * If we have an RT and/or log subvolume we need to make sure to flush
 +       * the write cache the device used for file data first.  This is to
 +       * ensure newly written file data make it to disk before logging the new
 +       * inode size in case of an extending write.
 +       */
 +      if (XFS_IS_REALTIME_INODE(ip))
 +              xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 +      else if (mp->m_logdev_targp != mp->m_ddev_targp)
 +              xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
        /*
         * All metadata updates are logged, which means that we just have to
         * an already allocated file and thus do not have any metadata to
         * commit.
         */
 -      if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
 -          mp->m_logdev_targp == mp->m_ddev_targp &&
 -          !XFS_IS_REALTIME_INODE(ip) &&
 -          !log_flushed)
 +      if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
 +          mp->m_logdev_targp == mp->m_ddev_targp)
                xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
        return error;
@@@ -205,21 -244,62 +205,21 @@@ xfs_file_dio_aio_read
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
 -      struct address_space    *mapping = iocb->ki_filp->f_mapping;
 -      struct inode            *inode = mapping->host;
 -      struct xfs_inode        *ip = XFS_I(inode);
 -      loff_t                  isize = i_size_read(inode);
 +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        size_t                  count = iov_iter_count(to);
 -      loff_t                  end = iocb->ki_pos + count - 1;
 -      struct iov_iter         data;
 -      struct xfs_buftarg      *target;
 -      ssize_t                 ret = 0;
 +      ssize_t                 ret;
  
        trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
  
        if (!count)
                return 0; /* skip atime */
  
 -      if (XFS_IS_REALTIME_INODE(ip))
 -              target = ip->i_mount->m_rtdev_targp;
 -      else
 -              target = ip->i_mount->m_ddev_targp;
 -
 -      /* DIO must be aligned to device logical sector size */
 -      if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
 -              if (iocb->ki_pos == isize)
 -                      return 0;
 -              return -EINVAL;
 -      }
 -
        file_accessed(iocb->ki_filp);
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 -      if (mapping->nrpages) {
 -              ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 -              if (ret)
 -                      goto out_unlock;
 -
 -              /*
 -               * Invalidate whole pages. This can return an error if we fail
 -               * to invalidate a page, but this should never happen on XFS.
 -               * Warn if it does fail.
 -               */
 -              ret = invalidate_inode_pages2_range(mapping,
 -                              iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -              ret = 0;
 -      }
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
 +      ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
 -      data = *to;
 -      ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 -                      xfs_get_blocks_direct, NULL, NULL, 0);
 -      if (ret >= 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(to, ret);
 -      }
 -
 -out_unlock:
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
  }
  
@@@ -237,9 -317,9 +237,9 @@@ xfs_file_dax_read
        if (!count)
                return 0; /* skip atime */
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 -      ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
 +      ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
        file_accessed(iocb->ki_filp);
        return ret;
@@@ -255,9 -335,9 +255,9 @@@ xfs_file_buffered_aio_read
  
        trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
  
 -      xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 +      xfs_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
 -      xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 +      xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
        return ret;
  }
@@@ -338,18 -418,15 +338,18 @@@ restart
        if (error <= 0)
                return error;
  
 -      error = xfs_break_layouts(inode, iolock, true);
 +      error = xfs_break_layouts(inode, iolock);
        if (error)
                return error;
  
 -      /* For changing security info in file_remove_privs() we need i_mutex */
 +      /*
 +       * For changing security info in file_remove_privs() we need i_rwsem
 +       * exclusively.
 +       */
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
 -              xfs_rw_iunlock(ip, *iolock);
 +              xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
 -              xfs_rw_ilock(ip, *iolock);
 +              xfs_ilock(ip, *iolock);
                goto restart;
        }
        /*
                spin_unlock(&ip->i_flags_lock);
                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
 -                              xfs_rw_iunlock(ip, *iolock);
 +                              xfs_iunlock(ip, *iolock);
                                *iolock = XFS_IOLOCK_EXCL;
 -                              xfs_rw_ilock(ip, *iolock);
 +                              xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
                        /*
        return 0;
  }
  
 +static int
 +xfs_dio_write_end_io(
 +      struct kiocb            *iocb,
 +      ssize_t                 size,
 +      unsigned                flags)
 +{
 +      struct inode            *inode = file_inode(iocb->ki_filp);
 +      struct xfs_inode        *ip = XFS_I(inode);
 +      loff_t                  offset = iocb->ki_pos;
 +      bool                    update_size = false;
 +      int                     error = 0;
 +
 +      trace_xfs_end_io_direct_write(ip, offset, size);
 +
 +      if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 +              return -EIO;
 +
 +      if (size <= 0)
 +              return size;
 +
 +      /*
 +       * We need to update the in-core inode size here so that we don't end up
 +       * with the on-disk inode size being outside the in-core inode size. We
 +       * have no other method of updating EOF for AIO, so always do it here
 +       * if necessary.
 +       *
 +       * We need to lock the test/set EOF update as we can be racing with
 +       * other IO completions here to update the EOF. Failing to serialise
 +       * here can result in EOF moving backwards and Bad Things Happen when
 +       * that occurs.
 +       */
 +      spin_lock(&ip->i_flags_lock);
 +      if (offset + size > i_size_read(inode)) {
 +              i_size_write(inode, offset + size);
 +              update_size = true;
 +      }
 +      spin_unlock(&ip->i_flags_lock);
 +
 +      if (flags & IOMAP_DIO_COW) {
 +              error = xfs_reflink_end_cow(ip, offset, size);
 +              if (error)
 +                      return error;
 +      }
 +
 +      if (flags & IOMAP_DIO_UNWRITTEN)
 +              error = xfs_iomap_write_unwritten(ip, offset, size);
 +      else if (update_size)
 +              error = xfs_setfilesize(ip, offset, size);
 +
 +      return error;
 +}
 +
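The size update in xfs_dio_write_end_io() above is the classic monotonic-EOF idiom, restated here in isolation (a condensation of the function above, not new logic):

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {	/* only ever grow */
		i_size_write(inode, offset + size);
		update_size = true;	/* on-disk size must catch up too */
	}
	spin_unlock(&ip->i_flags_lock);

Without the lock, two AIO completions racing on adjacent ranges could apply their i_size_write() calls out of order, and the later, smaller value would move EOF backwards.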
  /*
   * xfs_file_dio_aio_write - handle direct IO writes
   *
@@@ -510,7 -535,9 +510,7 @@@ xfs_file_dio_aio_write
        int                     unaligned_io = 0;
        int                     iolock;
        size_t                  count = iov_iter_count(from);
 -      loff_t                  end;
 -      struct iov_iter         data;
 -      struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
 +      struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        /* DIO must be aligned to device logical sector size */
                iolock = XFS_IOLOCK_SHARED;
        }
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
  
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
 -      end = iocb->ki_pos + count - 1;
 -
 -      if (mapping->nrpages) {
 -              ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 -              if (ret)
 -                      goto out;
 -
 -              /*
 -               * Invalidate whole pages. This can return an error if we fail
 -               * to invalidate a page, but this should never happen on XFS.
 -               * Warn if it does fail.
 -               */
 -              ret = invalidate_inode_pages2_range(mapping,
 -                              iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -              ret = 0;
 -      }
  
        /*
         * If we are doing unaligned IO, wait for all other IO to drain,
        if (unaligned_io)
                inode_dio_wait(inode);
        else if (iolock == XFS_IOLOCK_EXCL) {
 -              xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 +              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }
  
                        goto out;
        }
  
 -      data = *from;
 -      ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 -                      xfs_get_blocks_direct, xfs_end_io_direct_write,
 -                      NULL, DIO_ASYNC_EXTEND);
 -
 -      /* see generic_file_direct_write() for why this is necessary */
 -      if (mapping->nrpages) {
 -              invalidate_inode_pages2_range(mapping,
 -                                            iocb->ki_pos >> PAGE_SHIFT,
 -                                            end >> PAGE_SHIFT);
 -      }
 -
 -      if (ret > 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(from, ret);
 -      }
 +      ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
  
        /*
         * No fallback to buffered IO on errors for XFS, direct IO will either
@@@ -584,7 -643,7 +584,7 @@@ xfs_file_dax_write
        size_t                  count;
        loff_t                  pos;
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
  
        trace_xfs_file_dax_write(ip, count, pos);
 -
 -      ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
 +      ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
 -
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
        return error ? error : ret;
  }
  
@@@ -616,7 -677,7 +616,7 @@@ xfs_file_buffered_aio_write
        int                     enospc = 0;
        int                     iolock = XFS_IOLOCK_EXCL;
  
 -      xfs_rw_ilock(ip, iolock);
 +      xfs_ilock(ip, iolock);
  
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
@@@ -660,7 -721,7 +660,7 @@@ write_retry
  
        current->backing_dev_info = NULL;
  out:
 -      xfs_rw_iunlock(ip, iolock);
 +      xfs_iunlock(ip, iolock);
        return ret;
  }
  
@@@ -736,7 -797,7 +736,7 @@@ xfs_file_fallocate
                return -EOPNOTSUPP;
  
        xfs_ilock(ip, iolock);
 -      error = xfs_break_layouts(inode, &iolock, false);
 +      error = xfs_break_layouts(inode, &iolock);
        if (error)
                goto out_unlock;
  
@@@ -848,24 -909,6 +848,6 @@@ out_unlock
        return error;
  }
  
- STATIC ssize_t
- xfs_file_copy_range(
-       struct file     *file_in,
-       loff_t          pos_in,
-       struct file     *file_out,
-       loff_t          pos_out,
-       size_t          len,
-       unsigned int    flags)
- {
-       int             error;
- 
-       error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-                                    len, false);
-       if (error)
-               return error;
-       return len;
- }
- 
  STATIC int
  xfs_file_clone_range(
        struct file     *file_in,
                                     len, false);
  }
  
 -#define XFS_MAX_DEDUPE_LEN    (16 * 1024 * 1024)
  STATIC ssize_t
  xfs_file_dedupe_range(
        struct file     *src_file,
  {
        int             error;
  
 -      /*
 -       * Limit the total length we will dedupe for each operation.
 -       * This is intended to bound the total time spent in this
 -       * ioctl to something sane.
 -       */
 -      if (len > XFS_MAX_DEDUPE_LEN)
 -              len = XFS_MAX_DEDUPE_LEN;
 -
        error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
                                     len, true);
        if (error)
@@@ -1404,7 -1456,7 +1386,7 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
 -              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 +              ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
        } else {
                ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
@@@ -1431,9 -1483,15 +1413,9 @@@ xfs_filemap_fault
                return xfs_filemap_page_mkwrite(vma, vmf);
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      if (IS_DAX(inode)) {
 -              /*
 -               * we do not want to trigger unwritten extent conversion on read
 -               * faults - that is unnecessary overhead and would also require
 -               * changes to xfs_get_blocks_direct() to map unwritten extent
 -               * ioend for conversion on read-only mappings.
 -               */
 -              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 -      } else
 +      if (IS_DAX(inode))
 +              ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 +      else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
@@@ -1469,7 -1527,7 +1451,7 @@@ xfs_filemap_pmd_fault
        }
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
 +      ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (flags & FAULT_FLAG_WRITE)
@@@ -1549,7 -1607,6 +1531,6 @@@ const struct file_operations xfs_file_o
        .fsync          = xfs_file_fsync,
        .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
-       .copy_file_range = xfs_file_copy_range,
        .clone_file_range = xfs_file_clone_range,
        .dedupe_file_range = xfs_file_dedupe_range,
  };
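With the old __blockdev_direct_IO() call gone, all direct-write completion work in this file funnels through a single iomap callback. A minimal sketch of that wiring against the 4.10-era iomap API; the example_* names are illustrative, not part of this patch, and real error handling is elided:

static int
example_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,	/* bytes completed, or an error */
	unsigned		flags)	/* IOMAP_DIO_UNWRITTEN, IOMAP_DIO_COW */
{
	if (size <= 0)
		return size;
	if (flags & IOMAP_DIO_UNWRITTEN)
		return 0;	/* convert unwritten extents here */
	return 0;		/* otherwise update the on-disk size if needed */
}

static ssize_t
example_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	/* iomap_dio_rw() handles writeback, invalidation and AIO itself */
	return iomap_dio_rw(iocb, from, &xfs_iomap_ops,
			example_dio_write_end_io);
}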
diff --combined fs/xfs/xfs_reflink.c
@@@ -243,11 -243,12 +243,11 @@@ xfs_reflink_reserve_cow
        struct xfs_bmbt_irec    *imap,
        bool                    *shared)
  {
 -      struct xfs_bmbt_irec    got, prev;
 -      xfs_fileoff_t           end_fsb, orig_end_fsb;
 -      int                     eof = 0, error = 0;
 -      bool                    trimmed;
 +      struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      struct xfs_bmbt_irec    got;
 +      int                     error = 0;
 +      bool                    eof = false, trimmed;
        xfs_extnum_t            idx;
 -      xfs_extlen_t            align;
  
        /*
         * Search the COW fork extent list first.  This serves two purposes:
         * extent list is generally faster than going out to the shared extent
         * tree.
         */
 -      xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 +
 +      if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
 +              eof = true;
        if (!eof && got.br_startoff <= imap->br_startoff) {
                trace_xfs_reflink_cow_found(ip, imap);
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
        if (error)
                return error;
  
 -      end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
 -
 -      align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
 -      if (align)
 -              end_fsb = roundup_64(end_fsb, align);
 -
 -retry:
        error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
 -                      end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
 -      switch (error) {
 -      case 0:
 -              break;
 -      case -ENOSPC:
 -      case -EDQUOT:
 -              /* retry without any preallocation */
 +                      imap->br_blockcount, 0, &got, &idx, eof);
 +      if (error == -ENOSPC || error == -EDQUOT)
                trace_xfs_reflink_cow_enospc(ip, imap);
 -              if (end_fsb != orig_end_fsb) {
 -                      end_fsb = orig_end_fsb;
 -                      goto retry;
 -              }
 -              /*FALLTHRU*/
 -      default:
 +      if (error)
                return error;
 -      }
 -
 -      if (end_fsb != orig_end_fsb)
 -              xfs_inode_set_cowblocks_tag(ip);
  
        trace_xfs_reflink_cow_alloc(ip, &got);
        return 0;
@@@ -397,65 -418,87 +397,65 @@@ xfs_reflink_allocate_cow_range
  }
  
  /*
 - * Find the CoW reservation (and whether or not it needs block allocation)
 - * for a given byte offset of a file.
 + * Find the CoW reservation for a given byte offset of a file.
   */
  bool
  xfs_reflink_find_cow_mapping(
        struct xfs_inode                *ip,
        xfs_off_t                       offset,
 -      struct xfs_bmbt_irec            *imap,
 -      bool                            *need_alloc)
 +      struct xfs_bmbt_irec            *imap)
  {
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 -      xfs_fileoff_t                   bno;
 +      struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      xfs_fileoff_t                   offset_fsb;
 +      struct xfs_bmbt_irec            got;
        xfs_extnum_t                    idx;
  
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
        ASSERT(xfs_is_reflink_inode(ip));
  
 -      /* Find the extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      bno = XFS_B_TO_FSBT(ip->i_mount, offset);
 -      gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
 -      if (!gotp)
 +      offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
                return false;
 -
 -      xfs_bmbt_get_all(gotp, &irec);
 -      if (bno >= irec.br_startoff + irec.br_blockcount ||
 -          bno < irec.br_startoff)
 +      if (got.br_startoff > offset_fsb)
                return false;
  
        trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
 -                      &irec);
 -
 -      /* If it's still delalloc, we must allocate later. */
 -      *imap = irec;
 -      *need_alloc = !!(isnullstartblock(irec.br_startblock));
 -
 +                      &got);
 +      *imap = got;
        return true;
  }
  
  /*
   * Trim an extent to end at the next CoW reservation past offset_fsb.
   */
 -int
 +void
  xfs_reflink_trim_irec_to_next_cow(
        struct xfs_inode                *ip,
        xfs_fileoff_t                   offset_fsb,
        struct xfs_bmbt_irec            *imap)
  {
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 +      struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      struct xfs_bmbt_irec            got;
        xfs_extnum_t                    idx;
  
        if (!xfs_is_reflink_inode(ip))
 -              return 0;
 +              return;
  
        /* Find the extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
 -      if (!gotp)
 -              return 0;
 -      xfs_bmbt_get_all(gotp, &irec);
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
 +              return;
  
        /* This is the extent before; try sliding up one. */
 -      if (irec.br_startoff < offset_fsb) {
 -              idx++;
 -              if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
 -                      return 0;
 -              gotp = xfs_iext_get_ext(ifp, idx);
 -              xfs_bmbt_get_all(gotp, &irec);
 +      if (got.br_startoff < offset_fsb) {
 +              if (!xfs_iext_get_extent(ifp, idx + 1, &got))
 +                      return;
        }
  
 -      if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
 -              return 0;
 +      if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
 +              return;
  
 -      imap->br_blockcount = irec.br_startoff - imap->br_startoff;
 +      imap->br_blockcount = got.br_startoff - imap->br_startoff;
        trace_xfs_reflink_trim_irec(ip, imap);
 -
 -      return 0;
  }
  
  /*
@@@ -469,15 -512,18 +469,15 @@@ xfs_reflink_cancel_cow_blocks
        xfs_fileoff_t                   end_fsb)
  {
        struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      struct xfs_bmbt_irec            got, prev, del;
 +      struct xfs_bmbt_irec            got, del;
        xfs_extnum_t                    idx;
        xfs_fsblock_t                   firstfsb;
        struct xfs_defer_ops            dfops;
 -      int                             error = 0, eof = 0;
 +      int                             error = 0;
  
        if (!xfs_is_reflink_inode(ip))
                return 0;
 -
 -      xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 -      if (eof)
 +      if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
                return 0;
  
        while (got.br_startoff < end_fsb) {
                        xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
                }
  
 -              if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
 +              if (!xfs_iext_get_extent(ifp, ++idx, &got))
                        break;
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
        }
  
        /* clear tag if cow fork is emptied */
@@@ -591,13 -638,13 +591,13 @@@ xfs_reflink_end_cow
        xfs_off_t                       count)
  {
        struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      struct xfs_bmbt_irec            got, prev, del;
 +      struct xfs_bmbt_irec            got, del;
        struct xfs_trans                *tp;
        xfs_fileoff_t                   offset_fsb;
        xfs_fileoff_t                   end_fsb;
        xfs_fsblock_t                   firstfsb;
        struct xfs_defer_ops            dfops;
 -      int                             error, eof = 0;
 +      int                             error;
        unsigned int                    resblks;
        xfs_filblks_t                   rlen;
        xfs_extnum_t                    idx;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
  
 -      xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
 -                      &got, &prev);
 -
        /* If there is a hole at end_fsb - 1 go to the previous extent */
 -      if (eof || got.br_startoff > end_fsb) {
 +      if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
 +          got.br_startoff > end_fsb) {
                ASSERT(idx > 0);
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
 +              xfs_iext_get_extent(ifp, --idx, &got);
        }
  
        /* Walk backwards until we're out of the I/O range... */
                error = xfs_defer_finish(&tp, &dfops, ip);
                if (error)
                        goto out_defer;
 -
  next_extent:
 -              if (idx < 0)
 +              if (!xfs_iext_get_extent(ifp, idx, &got))
                        break;
 -              xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
        }
  
        error = xfs_trans_commit(tp);
        return error;
  }
  
- /*
-  * Read a page's worth of file data into the page cache.  Return the page
-  * locked.
-  */
- static struct page *
- xfs_get_page(
-       struct inode    *inode,
-       xfs_off_t       offset)
- {
-       struct address_space    *mapping;
-       struct page             *page;
-       pgoff_t                 n;
- 
-       n = offset >> PAGE_SHIFT;
-       mapping = inode->i_mapping;
-       page = read_mapping_page(mapping, n, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       lock_page(page);
-       return page;
- }
- 
- /*
-  * Compare extents of two files to see if they are the same.
-  */
- static int
- xfs_compare_extents(
-       struct inode    *src,
-       xfs_off_t       srcoff,
-       struct inode    *dest,
-       xfs_off_t       destoff,
-       xfs_off_t       len,
-       bool            *is_same)
- {
-       xfs_off_t       src_poff;
-       xfs_off_t       dest_poff;
-       void            *src_addr;
-       void            *dest_addr;
-       struct page     *src_page;
-       struct page     *dest_page;
-       xfs_off_t       cmp_len;
-       bool            same;
-       int             error;
- 
-       error = -EINVAL;
-       same = true;
-       while (len) {
-               src_poff = srcoff & (PAGE_SIZE - 1);
-               dest_poff = destoff & (PAGE_SIZE - 1);
-               cmp_len = min(PAGE_SIZE - src_poff,
-                             PAGE_SIZE - dest_poff);
-               cmp_len = min(cmp_len, len);
-               ASSERT(cmp_len > 0);
- 
-               trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
-                               XFS_I(dest), destoff);
- 
-               src_page = xfs_get_page(src, srcoff);
-               if (IS_ERR(src_page)) {
-                       error = PTR_ERR(src_page);
-                       goto out_error;
-               }
-               dest_page = xfs_get_page(dest, destoff);
-               if (IS_ERR(dest_page)) {
-                       error = PTR_ERR(dest_page);
-                       unlock_page(src_page);
-                       put_page(src_page);
-                       goto out_error;
-               }
-               src_addr = kmap_atomic(src_page);
-               dest_addr = kmap_atomic(dest_page);
- 
-               flush_dcache_page(src_page);
-               flush_dcache_page(dest_page);
- 
-               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
-                       same = false;
- 
-               kunmap_atomic(dest_addr);
-               kunmap_atomic(src_addr);
-               unlock_page(dest_page);
-               unlock_page(src_page);
-               put_page(dest_page);
-               put_page(src_page);
- 
-               if (!same)
-                       break;
-               srcoff += cmp_len;
-               destoff += cmp_len;
-               len -= cmp_len;
-       }
- 
-       *is_same = same;
-       return 0;
- 
- out_error:
-       trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
-       return error;
- }
- 
  /*
   * Link a range of blocks from one file to another.
   */
@@@ -1235,14 -1181,11 +1130,11 @@@ xfs_reflink_remap_range
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        struct xfs_mount        *mp = src->i_mount;
-       loff_t                  bs = inode_out->i_sb->s_blocksize;
        bool                    same_inode = (inode_in == inode_out);
        xfs_fileoff_t           sfsbno, dfsbno;
        xfs_filblks_t           fsblen;
        xfs_extlen_t            cowextsize;
-       loff_t                  isize;
        ssize_t                 ret;
-       loff_t                  blen;
  
        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return -EOPNOTSUPP;
                return -EIO;
  
        /* Lock both files against IO */
 -      if (same_inode) {
 -              xfs_ilock(src, XFS_IOLOCK_EXCL);
 +      lock_two_nondirectories(inode_in, inode_out);
 +      if (same_inode)
                xfs_ilock(src, XFS_MMAPLOCK_EXCL);
 -      } else {
 -              xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
 +      else
                xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
 -      }
  
-       /* Don't touch certain kinds of inodes */
-       ret = -EPERM;
-       if (IS_IMMUTABLE(inode_out))
-               goto out_unlock;
- 
-       ret = -ETXTBSY;
-       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-               goto out_unlock;
- 
-       /* Don't reflink dirs, pipes, sockets... */
-       ret = -EISDIR;
-       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-               goto out_unlock;
+       /* Check file eligibility and prepare for block sharing. */
        ret = -EINVAL;
-       if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-               goto out_unlock;
-       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-               goto out_unlock;
        /* Don't reflink realtime inodes */
        if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
                goto out_unlock;
        if (IS_DAX(inode_in) || IS_DAX(inode_out))
                goto out_unlock;
  
-       /* Are we going all the way to the end? */
-       isize = i_size_read(inode_in);
-       if (isize == 0) {
-               ret = 0;
-               goto out_unlock;
-       }
- 
-       /* Zero length dedupe exits immediately; reflink goes to EOF. */
-       if (len == 0) {
-               if (is_dedupe) {
-                       ret = 0;
-                       goto out_unlock;
-               }
-               len = isize - pos_in;
-       }
- 
-       /* Ensure offsets don't wrap and the input is inside i_size */
-       if (pos_in + len < pos_in || pos_out + len < pos_out ||
-           pos_in + len > isize)
-               goto out_unlock;
- 
-       /* Don't allow dedupe past EOF in the dest file */
-       if (is_dedupe) {
-               loff_t  disize;
- 
-               disize = i_size_read(inode_out);
-               if (pos_out >= disize || pos_out + len > disize)
-                       goto out_unlock;
-       }
- 
-       /* If we're linking to EOF, continue to the block boundary. */
-       if (pos_in + len == isize)
-               blen = ALIGN(isize, bs) - pos_in;
-       else
-               blen = len;
- 
-       /* Only reflink if we're aligned to block boundaries */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-               goto out_unlock;
- 
-       /* Don't allow overlapped reflink within the same file */
-       if (same_inode) {
-               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-                       goto out_unlock;
-       }
- 
-       /* Wait for the completion of any pending IOs on both files */
-       inode_dio_wait(inode_in);
-       if (!same_inode)
-               inode_dio_wait(inode_out);
- 
-       ret = filemap_write_and_wait_range(inode_in->i_mapping,
-                       pos_in, pos_in + len - 1);
-       if (ret)
-               goto out_unlock;
-       ret = filemap_write_and_wait_range(inode_out->i_mapping,
-                       pos_out, pos_out + len - 1);
-       if (ret)
+       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+                       &len, is_dedupe);
+       if (ret || len == 0)
                goto out_unlock;
  
        trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
  
-       /*
-        * Check that the extents are the same.
-        */
-       if (is_dedupe) {
-               bool            is_same = false;
- 
-               ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
-                               len, &is_same);
-               if (ret)
-                       goto out_unlock;
-               if (!is_same) {
-                       ret = -EBADE;
-                       goto out_unlock;
-               }
-       }
+       /* Set flags and remap blocks. */
        ret = xfs_reflink_set_inode_flag(src, dest);
        if (ret)
                goto out_unlock;
  
-       /*
-        * Invalidate the page cache so that we can clear any CoW mappings
-        * in the destination file.
-        */
-       truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                  PAGE_ALIGN(pos_out + len) - 1);
- 
        dfsbno = XFS_B_TO_FSBT(mp, pos_out);
        sfsbno = XFS_B_TO_FSBT(mp, pos_in);
        fsblen = XFS_B_TO_FSB(mp, len);
        if (ret)
                goto out_unlock;
  
+       /* Zap any page cache for the destination file's range. */
+       truncate_inode_pages_range(&inode_out->i_data, pos_out,
+                                  PAGE_ALIGN(pos_out + len) - 1);
+ 
        /*
         * Carry the cowextsize hint from src to dest if we're sharing the
         * entire source file to the entire destination file, the source file
  
  out_unlock:
        xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
 -      xfs_iunlock(src, XFS_IOLOCK_EXCL);
 -      if (src->i_ino != dest->i_ino) {
 +      if (!same_inode)
                xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
 -              xfs_iunlock(dest, XFS_IOLOCK_EXCL);
 -      }
 +      unlock_two_nondirectories(inode_in, inode_out);
        if (ret)
                trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
        return ret;
@@@ -1648,3 -1502,37 +1447,3 @@@ out
        trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
        return error;
  }
 -
 -/*
 - * Does this inode have any real CoW reservations?
 - */
 -bool
 -xfs_reflink_has_real_cow_blocks(
 -      struct xfs_inode                *ip)
 -{
 -      struct xfs_bmbt_irec            irec;
 -      struct xfs_ifork                *ifp;
 -      struct xfs_bmbt_rec_host        *gotp;
 -      xfs_extnum_t                    idx;
 -
 -      if (!xfs_is_reflink_inode(ip))
 -              return false;
 -
 -      /* Go find the old extent in the CoW fork. */
 -      ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 -      gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
 -      while (gotp) {
 -              xfs_bmbt_get_all(gotp, &irec);
 -
 -              if (!isnullstartblock(irec.br_startblock))
 -                      return true;
 -
 -              /* Roll on... */
 -              idx++;
 -              if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
 -                      break;
 -              gotp = xfs_iext_get_ext(ifp, idx);
 -      }
 -
 -      return false;
 -}
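Across xfs_reflink.c this merge replaces every open-coded extent walk (xfs_bmap_search_extents() plus xfs_bmbt_get_all() on raw in-core records) with the xfs_iext_lookup_extent()/xfs_iext_get_extent() pair, which hand back decoded xfs_bmbt_irec structures and a boolean instead of eof/index bookkeeping. The resulting iteration idiom, condensed from the converted functions above (a sketch, not a complete function):

	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		idx;

	/* false means no extent at or after offset_fsb */
	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
		return 0;
	while (got.br_startoff < end_fsb) {
		/* ... operate on "got" ... */
		if (!xfs_iext_get_extent(ifp, ++idx, &got))
			break;		/* ran off the end of the fork */
	}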
diff --combined include/linux/fs.h
@@@ -28,6 -28,7 +28,6 @@@
  #include <linux/uidgid.h>
  #include <linux/lockdep.h>
  #include <linux/percpu-rwsem.h>
 -#include <linux/blk_types.h>
  #include <linux/workqueue.h>
  #include <linux/percpu-rwsem.h>
  #include <linux/delayed_call.h>
@@@ -37,7 -38,6 +37,7 @@@
  
  struct backing_dev_info;
  struct bdi_writeback;
 +struct bio;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
@@@ -151,6 -151,58 +151,6 @@@ typedef int (dio_iodone_t)(struct kioc
   */
  #define CHECK_IOVEC_ONLY -1
  
 -/*
 - * The below are the various read and write flags that we support. Some of
 - * them include behavioral modifiers that send information down to the
 - * block layer and IO scheduler. They should be used along with a req_op.
 - * Terminology:
 - *
 - *    The block layer uses device plugging to defer IO a little bit, in
 - *    the hope that we will see more IO very shortly. This increases
 - *    coalescing of adjacent IO and thus reduces the number of IOs we
 - *    have to send to the device. It also allows for better queuing,
 - *    if the IO isn't mergeable. If the caller is going to be waiting
 - *    for the IO, then he must ensure that the device is unplugged so
 - *    that the IO is dispatched to the driver.
 - *
 - *    All IO is handled async in Linux. This is fine for background
 - *    writes, but for reads or writes that someone waits for completion
 - *    on, we want to notify the block layer and IO scheduler so that they
 - *    know about it. That allows them to make better scheduling
 - *    decisions. So when the below references 'sync' and 'async', it
 - *    is referencing this priority hint.
 - *
 - * With that in mind, the available types are:
 - *
 - * READ                       A normal read operation. Device will be plugged.
 - * READ_SYNC          A synchronous read. Device is not plugged, caller can
 - *                    immediately wait on this read without caring about
 - *                    unplugging.
 - * WRITE              A normal async write. Device will be plugged.
 - * WRITE_SYNC         Synchronous write. Identical to WRITE, but passes down
 - *                    the hint that someone will be waiting on this IO
 - *                    shortly. The write equivalent of READ_SYNC.
 - * WRITE_ODIRECT      Special case write for O_DIRECT only.
 - * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
 - * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
 - *                    non-volatile media on completion.
 - * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
 - *                    by a cache flush and data is guaranteed to be on
 - *                    non-volatile media on completion.
 - *
 - */
 -#define RW_MASK                       REQ_OP_WRITE
 -
 -#define READ                  REQ_OP_READ
 -#define WRITE                 REQ_OP_WRITE
 -
 -#define READ_SYNC             REQ_SYNC
 -#define WRITE_SYNC            (REQ_SYNC | REQ_NOIDLE)
 -#define WRITE_ODIRECT         REQ_SYNC
 -#define WRITE_FLUSH           (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
 -#define WRITE_FUA             (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 -#define WRITE_FLUSH_FUA               (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
 -
  /*
   * Attribute flags.  These should be or-ed together to figure out what
   * has been changed!
@@@ -1726,24 -1778,17 +1726,30 @@@ extern ssize_t vfs_writev(struct file *
                unsigned long, loff_t *, int);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
+ extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+                                     struct inode *inode_out, loff_t pos_out,
+                                     u64 *len, bool is_dedupe);
  extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, u64 len);
+ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                        struct inode *dest, loff_t destoff,
+                                        loff_t len, bool *is_same);
  extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
  
 +static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
 +                                    struct file *file_out, loff_t pos_out,
 +                                    u64 len)
 +{
 +      int ret;
 +
 +      sb_start_write(file_inode(file_out)->i_sb);
 +      ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
 +      sb_end_write(file_inode(file_out)->i_sb);
 +
 +      return ret;
 +}
 +
  struct super_operations {
        struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
@@@ -2084,11 -2129,11 +2090,11 @@@ extern int may_umount_tree(struct vfsmo
  extern int may_umount(struct vfsmount *);
  extern long do_mount(const char *, const char __user *,
                     const char *, unsigned long, void *);
- extern struct vfsmount *collect_mounts(struct path *);
+ extern struct vfsmount *collect_mounts(const struct path *);
  extern void drop_collected_mounts(struct vfsmount *);
  extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
- extern int vfs_statfs(struct path *, struct kstatfs *);
+ extern int vfs_statfs(const struct path *, struct kstatfs *);
  extern int user_statfs(const char __user *, struct kstatfs *);
  extern int fd_statfs(int, struct kstatfs *);
  extern int vfs_ustat(dev_t, struct kstatfs *);
@@@ -2460,6 -2505,19 +2466,6 @@@ extern void make_bad_inode(struct inod
  extern bool is_bad_inode(struct inode *);
  
  #ifdef CONFIG_BLOCK
 -static inline bool op_is_write(unsigned int op)
 -{
 -      return op == REQ_OP_READ ? false : true;
 -}
 -
 -/*
 - * return data direction, READ or WRITE
 - */
 -static inline int bio_data_dir(struct bio *bio)
 -{
 -      return op_is_write(bio_op(bio)) ? WRITE : READ;
 -}
 -
  extern void check_disk_size_change(struct gendisk *disk,
                                   struct block_device *bdev);
  extern int revalidate_disk(struct gendisk *);
@@@ -2657,7 -2715,7 +2663,7 @@@ extern struct file * open_exec(const ch
   
  /* fs/dcache.c -- generic fs support functions */
  extern bool is_subdir(struct dentry *, struct dentry *);
- extern bool path_is_under(struct path *, struct path *);
+ extern bool path_is_under(const struct path *, const struct path *);
  
  extern char *file_path(struct file *, char *, int);
  
@@@ -2730,6 -2788,7 +2736,6 @@@ static inline void remove_inode_hash(st
  extern void inode_sb_list_add(struct inode *inode);
  
  #ifdef CONFIG_BLOCK
 -extern blk_qc_t submit_bio(struct bio *);
  extern int bdev_read_only(struct block_device *);
  #endif
  extern int set_blocksize(struct block_device *, int);
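Alongside the constified struct path prototypes, this fs.h hunk exports the clone/dedupe preparation helpers (vfs_clone_file_prep_inodes(), vfs_dedupe_file_range_compare()) that xfs_reflink_remap_range() consumed earlier in the diff, and adds the do_clone_file_range() wrapper, which brackets vfs_clone_file_range() with sb_start_write()/sb_end_write() for in-kernel callers that clone outside the write() syscall path and so hold no freeze protection on the destination superblock. A hedged sketch of such a caller (example_copy_up() is hypothetical):

static int example_copy_up(struct file *old, struct file *new, loff_t len)
{
	int err;

	/* takes freeze protection on new's superblock for us */
	err = do_clone_file_range(old, 0, new, 0, len);
	if (!err)
		return 0;

	/* different fs, or cloning unsupported: fall back to copying */
	return err;
}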
diff --combined kernel/audit.c
@@@ -107,6 -107,7 +107,6 @@@ static u32 audit_rate_limit
   * When set to zero, this means unlimited. */
  static u32    audit_backlog_limit = 64;
  #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
 -static u32    audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
  static u32    audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
  
  /* The identity of the user shutting down the audit system. */
@@@ -125,7 -126,7 +125,7 @@@ static atomic_t    audit_lost = ATOMIC_
  
  /* The netlink socket. */
  static struct sock *audit_sock;
 -static int audit_net_id;
 +static unsigned int audit_net_id;
  
  /* Hash for inode-based rules */
  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@@ -137,18 -138,11 +137,18 @@@ static DEFINE_SPINLOCK(audit_freelist_l
  static int       audit_freelist_count;
  static LIST_HEAD(audit_freelist);
  
 -static struct sk_buff_head audit_skb_queue;
 -/* queue of skbs to send to auditd when/if it comes back */
 -static struct sk_buff_head audit_skb_hold_queue;
 +/* queue msgs to send via kauditd_task */
 +static struct sk_buff_head audit_queue;
 +/* queue msgs due to temporary unicast send problems */
 +static struct sk_buff_head audit_retry_queue;
 +/* queue msgs waiting for new auditd connection */
 +static struct sk_buff_head audit_hold_queue;
 +
 +/* queue servicing thread */
  static struct task_struct *kauditd_task;
  static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 +
 +/* waitqueue for callers who are blocked on the audit backlog */
  static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
  
  static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
@@@ -344,7 -338,7 +344,7 @@@ static int audit_set_backlog_limit(u32 
  static int audit_set_backlog_wait_time(u32 timeout)
  {
        return audit_do_config_change("audit_backlog_wait_time",
 -                                    &audit_backlog_wait_time_master, timeout);
 +                                    &audit_backlog_wait_time, timeout);
  }
  
  static int audit_set_enabled(u32 state)
@@@ -370,11 -364,30 +370,11 @@@ static int audit_set_failure(u32 state
        return audit_do_config_change("audit_failure", &audit_failure, state);
  }
  
 -/*
 - * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
 - * already have been sent via prink/syslog and so if these messages are dropped
 - * it is not a huge concern since we already passed the audit_log_lost()
 - * notification and stuff.  This is just nice to get audit messages during
 - * boot before auditd is running or messages generated while auditd is stopped.
 - * This only holds messages if audit_default is set, aka booting with audit=1
 - * or building your kernel that way.
 - */
 -static void audit_hold_skb(struct sk_buff *skb)
 -{
 -      if (audit_default &&
 -          (!audit_backlog_limit ||
 -           skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
 -              skb_queue_tail(&audit_skb_hold_queue, skb);
 -      else
 -              kfree_skb(skb);
 -}
 -
  /*
   * For one reason or another this nlh isn't getting delivered to the userspace
   * audit daemon, just send it to printk.
   */
 -static void audit_printk_skb(struct sk_buff *skb)
 +static void kauditd_printk_skb(struct sk_buff *skb)
  {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        char *data = nlmsg_data(nlh);
                else
                        audit_log_lost("printk limit exceeded");
        }
 +}
 +
 +/**
 + * kauditd_hold_skb - Queue an audit record, waiting for auditd
 + * @skb: audit record
 + *
 + * Description:
 + * Queue the audit record, waiting for an instance of auditd.  When this
 + * function is called we haven't given up yet on sending the record, but things
 + * are not looking good.  The first thing we want to do is try to write the
 + * record via printk and then see if we want to try and hold on to the record
 + * and queue it, if we have room.  If we want to hold on to the record, but we
 + * don't have room, record a record lost message.
 + */
 +static void kauditd_hold_skb(struct sk_buff *skb)
 +{
 +      /* at this point it is uncertain if we will ever send this to auditd so
 +       * try to send the message via printk before we go any further */
 +      kauditd_printk_skb(skb);
 +
 +      /* can we just silently drop the message? */
 +      if (!audit_default) {
 +              kfree_skb(skb);
 +              return;
 +      }
 +
 +      /* if we have room, queue the message */
 +      if (!audit_backlog_limit ||
 +          skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
 +              skb_queue_tail(&audit_hold_queue, skb);
 +              return;
 +      }
  
 -      audit_hold_skb(skb);
 +      /* we have no other options - drop the message */
 +      audit_log_lost("kauditd hold queue overflow");
 +      kfree_skb(skb);
  }
  
 -static void kauditd_send_skb(struct sk_buff *skb)
 +/**
 + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
 + * @skb: audit record
 + *
 + * Description:
 + * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
 + * but for some reason we are having problems sending it audit records so
 + * queue the given record and attempt to resend.
 + */
 +static void kauditd_retry_skb(struct sk_buff *skb)
  {
 -      int err;
 -      int attempts = 0;
 -#define AUDITD_RETRIES 5
 +      /* NOTE: because records should only live in the retry queue for a
 +       * short period of time, before either being sent or moved to the hold
 +       * queue, we don't currently enforce a limit on this queue */
 +      skb_queue_tail(&audit_retry_queue, skb);
 +}
 +
 +/**
 + * auditd_reset - Disconnect the auditd connection
 + *
 + * Description:
 + * Break the auditd/kauditd connection and move all the records in the retry
 + * queue into the hold queue in case auditd reconnects.  The audit_cmd_mutex
 + * must be held when calling this function.
 + */
 +static void auditd_reset(void)
 +{
 +      struct sk_buff *skb;
 +
 +      /* break the connection */
 +      if (audit_sock) {
 +              sock_put(audit_sock);
 +              audit_sock = NULL;
 +      }
 +      audit_pid = 0;
 +      audit_nlk_portid = 0;
 +
 +      /* flush all of the retry queue to the hold queue */
 +      while ((skb = skb_dequeue(&audit_retry_queue)))
 +              kauditd_hold_skb(skb);
 +}
 +
 +/**
 + * kauditd_send_unicast_skb - Send a record via unicast to auditd
 + * @skb: audit record
 + */
 +static int kauditd_send_unicast_skb(struct sk_buff *skb)
 +{
 +      int rc;
  
 -restart:
 -      /* take a reference in case we can't send it and we want to hold it */
 +      /* if we know nothing is connected, don't even try the netlink call */
 +      if (!audit_pid)
 +              return -ECONNREFUSED;
 +
 +      /* get an extra skb reference in case we fail to send */
        skb_get(skb);
 -      err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
 -      if (err < 0) {
 -              pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
 -                     audit_pid, err);
 -              if (audit_pid) {
 -                      if (err == -ECONNREFUSED || err == -EPERM
 -                          || ++attempts >= AUDITD_RETRIES) {
 -                              char s[32];
 -
 -                              snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
 -                              audit_log_lost(s);
 -                              audit_pid = 0;
 -                              audit_sock = NULL;
 -                      } else {
 -                              pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
 -                                      attempts, audit_pid);
 -                              set_current_state(TASK_INTERRUPTIBLE);
 -                              schedule();
 -                              goto restart;
 -                      }
 -              }
 -              /* we might get lucky and get this in the next auditd */
 -              audit_hold_skb(skb);
 -      } else
 -              /* drop the extra reference if sent ok */
 +      rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
 +      if (rc >= 0) {
                consume_skb(skb);
 +              rc = 0;
 +      }
 +
 +      return rc;
  }
  
  /*
 - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
 + * kauditd_send_multicast_skb - Send a record to any multicast listeners
 + * @skb: audit record
   *
 + * Description:
   * This function doesn't consume an skb as might be expected since it has to
   * copy it anyways.
   */
 -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
 +static void kauditd_send_multicast_skb(struct sk_buff *skb)
  {
 -      struct sk_buff          *copy;
 -      struct audit_net        *aunet = net_generic(&init_net, audit_net_id);
 -      struct sock             *sock = aunet->nlsk;
 +      struct sk_buff *copy;
 +      struct audit_net *aunet = net_generic(&init_net, audit_net_id);
 +      struct sock *sock = aunet->nlsk;
 +      struct nlmsghdr *nlh;
  
        if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
                return;
         * no reason for new multicast clients to continue with this
         * non-compliance.
         */
 -      copy = skb_copy(skb, gfp_mask);
 +      copy = skb_copy(skb, GFP_KERNEL);
        if (!copy)
                return;
 +      nlh = nlmsg_hdr(copy);
 +      nlh->nlmsg_len = skb->len;
  
 -      nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
 +      nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
  }
  
 -/*
 - * flush_hold_queue - empty the hold queue if auditd appears
 - *
 - * If auditd just started, drain the queue of messages already
 - * sent to syslog/printk.  Remember loss here is ok.  We already
 - * called audit_log_lost() if it didn't go out normally.  so the
 - * race between the skb_dequeue and the next check for audit_pid
 - * doesn't matter.
 +/**
 + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread
   *
 - * If you ever find kauditd to be too slow we can get a perf win
 - * by doing our own locking and keeping better track if there
 - * are messages in this queue.  I don't see the need now, but
 - * in 5 years when I want to play with this again I'll see this
 - * note and still have no friggin idea what i'm thinking today.
 + * Description:
 + * This function is for use by the wait_event_freezable() call in
 + * kauditd_thread().
   */
 -static void flush_hold_queue(void)
 +static int kauditd_wake_condition(void)
  {
 -      struct sk_buff *skb;
 -
 -      if (!audit_default || !audit_pid)
 -              return;
 -
 -      skb = skb_dequeue(&audit_skb_hold_queue);
 -      if (likely(!skb))
 -              return;
 +      static int pid_last = 0;
 +      int rc;
 +      int pid = audit_pid;
  
 -      while (skb && audit_pid) {
 -              kauditd_send_skb(skb);
 -              skb = skb_dequeue(&audit_skb_hold_queue);
 -      }
 +      /* wake on new messages or a change in the connected auditd */
 +      rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
 +      if (rc)
 +              pid_last = pid;
  
 -      /*
 -       * if auditd just disappeared but we
 -       * dequeued an skb we need to drop ref
 -       */
 -      consume_skb(skb);
 +      return rc;
  }
  
  static int kauditd_thread(void *dummy)
  {
 +      int rc;
 +      int auditd = 0;
 +      int reschedule = 0;
 +      struct sk_buff *skb;
 +      struct nlmsghdr *nlh;
 +
 +#define UNICAST_RETRIES 5
 +#define AUDITD_BAD(x,y) \
 +      ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
 +
 +      /* NOTE: we do invalidate the auditd connection flag on any sending
 +       * errors, but we only "restore" the connection flag at specific places
 +       * in the loop in order to help ensure proper ordering of audit
 +       * records */
 +
        set_freezable();
        while (!kthread_should_stop()) {
 -              struct sk_buff *skb;
 -
 -              flush_hold_queue();
 +              /* NOTE: possible area for future improvement is to look at
 +               *       the hold and retry queues, since only this thread
 +               *       has access to these queues we might be able to do
 +               *       our own queuing and skip some/all of the locking */
 +
 +              /* NOTE: it might be a fun experiment to split the hold and
 +               *       retry queue handling to another thread, but the
 +               *       synchronization issues and other overhead might kill
 +               *       any performance gains */
 +
 +              /* attempt to flush the hold queue */
 +              while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
 +                      rc = kauditd_send_unicast_skb(skb);
 +                      if (rc) {
 +                              /* requeue to the same spot */
 +                              skb_queue_head(&audit_hold_queue, skb);
 +
 +                              auditd = 0;
 +                              if (AUDITD_BAD(rc, reschedule)) {
 +                                      mutex_lock(&audit_cmd_mutex);
 +                                      auditd_reset();
 +                                      mutex_unlock(&audit_cmd_mutex);
 +                                      reschedule = 0;
 +                              }
 +                      } else
 +                              /* we were able to send successfully */
 +                              reschedule = 0;
 +              }
  
 -              skb = skb_dequeue(&audit_skb_queue);
 +              /* attempt to flush the retry queue */
 +              while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
 +                      rc = kauditd_send_unicast_skb(skb);
 +                      if (rc) {
 +                              auditd = 0;
 +                              if (AUDITD_BAD(rc, reschedule)) {
 +                                      kauditd_hold_skb(skb);
 +                                      mutex_lock(&audit_cmd_mutex);
 +                                      auditd_reset();
 +                                      mutex_unlock(&audit_cmd_mutex);
 +                                      reschedule = 0;
 +                              } else
 +                                      /* temporary problem (we hope), queue
 +                                       * to the same spot and retry */
 +                                      skb_queue_head(&audit_retry_queue, skb);
 +                      } else
 +                              /* we were able to send successfully */
 +                              reschedule = 0;
 +              }
  
 +              /* standard queue processing, try to be as quick as possible */
 +quick_loop:
 +              skb = skb_dequeue(&audit_queue);
                if (skb) {
 -                      if (!audit_backlog_limit ||
 -                          (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
 -                              wake_up(&audit_backlog_wait);
 -                      if (audit_pid)
 -                              kauditd_send_skb(skb);
 +                      /* setup the netlink header, see the comments in
 +                       * kauditd_send_multicast_skb() for length quirks */
 +                      nlh = nlmsg_hdr(skb);
 +                      nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
 +
 +                      /* attempt to send to any multicast listeners */
 +                      kauditd_send_multicast_skb(skb);
 +
 +                      /* attempt to send to auditd, queue on failure */
 +                      if (auditd) {
 +                              rc = kauditd_send_unicast_skb(skb);
 +                              if (rc) {
 +                                      auditd = 0;
 +                                      if (AUDITD_BAD(rc, reschedule)) {
 +                                              mutex_lock(&audit_cmd_mutex);
 +                                              auditd_reset();
 +                                              mutex_unlock(&audit_cmd_mutex);
 +                                              reschedule = 0;
 +                                      }
 +
 +                                      /* move to the retry queue */
 +                                      kauditd_retry_skb(skb);
 +                              } else
 +                                      /* everything is working so go fast! */
 +                                      goto quick_loop;
 +                      } else if (reschedule)
 +                              /* we are currently having problems, move to
 +                               * the retry queue */
 +                              kauditd_retry_skb(skb);
                        else
 -                              audit_printk_skb(skb);
 -                      continue;
 -              }
 +                              /* dump the message via printk and hold it */
 +                              kauditd_hold_skb(skb);
 +              } else {
 +                      /* we have flushed the backlog so wake everyone */
 +                      wake_up(&audit_backlog_wait);
 +
 +                      /* if everything is okay with auditd (if present), go
 +                       * to sleep until there is something new in the queue
 +                       * or we have a change in the connected auditd;
 +                       * otherwise simply reschedule to give things a chance
 +                       * to recover */
 +                      if (reschedule) {
 +                              set_current_state(TASK_INTERRUPTIBLE);
 +                              schedule();
 +                      } else
 +                              wait_event_freezable(kauditd_wait,
 +                                                   kauditd_wake_condition());
  
 -              wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
 +                      /* update the auditd connection status */
 +                      auditd = (audit_pid ? 1 : 0);
 +              }
        }
 +
        return 0;
  }
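The reworked kauditd_thread() above services three queues in strict order: the hold queue (records parked for the next auditd instance), the retry queue (records that hit a transient unicast failure), and only then the main queue, so records reach auditd in the order they were generated. Its per-record disposition after a failed unicast send can be condensed to the following sketch (a slight simplification of the loop above; in the real code auditd_reset() is called with audit_cmd_mutex held):

	rc = kauditd_send_unicast_skb(skb);
	if (rc == 0)
		return;				/* delivered to auditd */
	if (rc == -ECONNREFUSED || rc == -EPERM ||
	    ++reschedule >= UNICAST_RETRIES) {
		auditd_reset();			/* connection is dead */
		kauditd_hold_skb(skb);		/* printk, then hold queue */
	} else
		kauditd_retry_skb(skb);		/* transient, retry soon */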
  
@@@ -735,7 -596,6 +735,7 @@@ static int audit_send_reply_thread(voi
        kfree(reply);
        return 0;
  }
 +
  /**
   * audit_send_reply - send an audit reply message via netlink
   * @request_skb: skb of request we are replying to (used to target the reply)
@@@ -972,6 -832,16 +972,6 @@@ static int audit_receive_msg(struct sk_
        if (err)
                return err;
  
 -      /* As soon as there's any sign of userspace auditd,
 -       * start kauditd to talk to it */
 -      if (!kauditd_task) {
 -              kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 -              if (IS_ERR(kauditd_task)) {
 -                      err = PTR_ERR(kauditd_task);
 -                      kauditd_task = NULL;
 -                      return err;
 -              }
 -      }
        seq  = nlh->nlmsg_seq;
        data = nlmsg_data(nlh);
  
                s.rate_limit            = audit_rate_limit;
                s.backlog_limit         = audit_backlog_limit;
                s.lost                  = atomic_read(&audit_lost);
 -              s.backlog               = skb_queue_len(&audit_skb_queue);
 +              s.backlog               = skb_queue_len(&audit_queue);
                s.feature_bitmap        = AUDIT_FEATURE_BITMAP_ALL;
 -              s.backlog_wait_time     = audit_backlog_wait_time_master;
 +              s.backlog_wait_time     = audit_backlog_wait_time;
                audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
                break;
        }
                        }
                        if (audit_enabled != AUDIT_OFF)
                                audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
 -                      audit_pid = new_pid;
 -                      audit_nlk_portid = NETLINK_CB(skb).portid;
 -                      audit_sock = skb->sk;
 +                      if (new_pid) {
 +                              if (audit_sock)
 +                                      sock_put(audit_sock);
 +                              audit_pid = new_pid;
 +                              audit_nlk_portid = NETLINK_CB(skb).portid;
 +                              sock_hold(skb->sk);
 +                              audit_sock = skb->sk;
 +                      } else {
 +                              auditd_reset();
 +                      }
 +                      wake_up_interruptible(&kauditd_wait);
                }
                if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(s.rate_limit);
@@@ -1305,13 -1167,14 +1305,13 @@@ static void __net_exit audit_net_exit(s
  {
        struct audit_net *aunet = net_generic(net, audit_net_id);
        struct sock *sock = aunet->nlsk;
 -      if (sock == audit_sock) {
 -              audit_pid = 0;
 -              audit_sock = NULL;
 -      }
 +      mutex_lock(&audit_cmd_mutex);
 +      if (sock == audit_sock)
 +              auditd_reset();
 +      mutex_unlock(&audit_cmd_mutex);
  
 -      RCU_INIT_POINTER(aunet->nlsk, NULL);
 -      synchronize_net();
        netlink_kernel_release(sock);
 +      aunet->nlsk = NULL;
  }
  
  static struct pernet_operations audit_net_ops __net_initdata = {
@@@ -1333,24 -1196,17 +1333,24 @@@ static int __init audit_init(void
                audit_default ? "enabled" : "disabled");
        register_pernet_subsys(&audit_net_ops);
  
 -      skb_queue_head_init(&audit_skb_queue);
 -      skb_queue_head_init(&audit_skb_hold_queue);
 +      skb_queue_head_init(&audit_queue);
 +      skb_queue_head_init(&audit_retry_queue);
 +      skb_queue_head_init(&audit_hold_queue);
        audit_initialized = AUDIT_INITIALIZED;
        audit_enabled = audit_default;
        audit_ever_enabled |= !!audit_default;
  
 -      audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 -
        for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
                INIT_LIST_HEAD(&audit_inode_hash[i]);
  
 +      kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 +      if (IS_ERR(kauditd_task)) {
 +              int err = PTR_ERR(kauditd_task);
 +              panic("audit: failed to start the kauditd thread (%d)\n", err);
 +      }
 +
 +      audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 +
        return 0;
  }
  __initcall(audit_init);
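audit_init() now starts kauditd unconditionally and treats failure as fatal. The IS_ERR()/PTR_ERR() pair it uses works because kthread_run() never returns NULL on failure; it returns an errno encoded into the top of the pointer range. A sketch of that convention, paraphrasing the kernel's include/linux/err.h (treat this as illustrative rather than the exact kernel source):

#include <stdbool.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
        return (void *)error;                   /* e.g. ERR_PTR(-ENOMEM) */
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;                       /* recover the errno */
}

static inline bool IS_ERR(const void *ptr)
{
        /* the highest MAX_ERRNO addresses are reserved for error codes */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

That is why the hunk compares with IS_ERR() rather than against NULL: on failure kauditd_task holds an encoded errno, never a null pointer.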
@@@ -1483,6 -1339,24 +1483,6 @@@ static inline void audit_get_stamp(stru
        }
  }
  
 -/*
 - * Wait for auditd to drain the queue a little
 - */
 -static long wait_for_auditd(long sleep_time)
 -{
 -      DECLARE_WAITQUEUE(wait, current);
 -
 -      if (audit_backlog_limit &&
 -          skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
 -              add_wait_queue_exclusive(&audit_backlog_wait, &wait);
 -              set_current_state(TASK_UNINTERRUPTIBLE);
 -              sleep_time = schedule_timeout(sleep_time);
 -              remove_wait_queue(&audit_backlog_wait, &wait);
 -      }
 -
 -      return sleep_time;
 -}
 -
  /**
   * audit_log_start - obtain an audit buffer
   * @ctx: audit_context (may be NULL)
  struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                     int type)
  {
 -      struct audit_buffer     *ab     = NULL;
 -      struct timespec         t;
 -      unsigned int            uninitialized_var(serial);
 -      int reserve = 5; /* Allow atomic callers to go up to five
 -                          entries over the normal backlog limit */
 -      unsigned long timeout_start = jiffies;
 +      struct audit_buffer *ab;
 +      struct timespec t;
 +      unsigned int uninitialized_var(serial);
  
        if (audit_initialized != AUDIT_INITIALIZED)
                return NULL;
        if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
                return NULL;
  
 -      if (gfp_mask & __GFP_DIRECT_RECLAIM) {
 -              if (audit_pid && audit_pid == current->tgid)
 -                      gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 -              else
 -                      reserve = 0;
 -      }
 -
 -      while (audit_backlog_limit
 -             && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
 -              if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
 -                      long sleep_time;
 +      /* don't ever fail/sleep on these two conditions:
 +       * 1. auditd generated record - since we need auditd to drain the
 +       *    queue; also, when we are checking for auditd, compare PIDs using
 +       *    task_tgid_vnr() since audit_pid is set in audit_receive_msg()
 +       *    using a PID anchored in the caller's namespace
 +       * 2. audit command message - record types 1000 through 1099 inclusive
 +       *    are command messages/records used to manage the kernel subsystem
 +       *    and the audit userspace, blocking on these messages could cause
 +       *    problems under load, so don't do it (note: not all of these
 +       *    command types are valid as record types, but it is quicker to
 +       *    just check two ints than a series of ints in an if/switch stmt) */
 +      if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
 +            (type >= 1000 && type <= 1099))) {
 +              long sleep_time = audit_backlog_wait_time;
 +
 +              while (audit_backlog_limit &&
 +                     (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
 +                      /* wake kauditd to try to flush the queue */
 +                      wake_up_interruptible(&kauditd_wait);
  
 -                      sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
 -                      if (sleep_time > 0) {
 -                              sleep_time = wait_for_auditd(sleep_time);
 -                              if (sleep_time > 0)
 -                                      continue;
 +                      /* sleep if we are allowed and we haven't exhausted our
 +                       * backlog wait limit */
 +                      if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
 +                          (sleep_time > 0)) {
 +                              DECLARE_WAITQUEUE(wait, current);
 +
 +                              add_wait_queue_exclusive(&audit_backlog_wait,
 +                                                       &wait);
 +                              set_current_state(TASK_UNINTERRUPTIBLE);
 +                              sleep_time = schedule_timeout(sleep_time);
 +                              remove_wait_queue(&audit_backlog_wait, &wait);
 +                      } else {
 +                              if (audit_rate_check() && printk_ratelimit())
 +                                      pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
 +                                              skb_queue_len(&audit_queue),
 +                                              audit_backlog_limit);
 +                              audit_log_lost("backlog limit exceeded");
 +                              return NULL;
                        }
                }
 -              if (audit_rate_check() && printk_ratelimit())
 -                      pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
 -                              skb_queue_len(&audit_skb_queue),
 -                              audit_backlog_limit);
 -              audit_log_lost("backlog limit exceeded");
 -              audit_backlog_wait_time = 0;
 -              wake_up(&audit_backlog_wait);
 -              return NULL;
        }
  
 -      if (!reserve && !audit_backlog_wait_time)
 -              audit_backlog_wait_time = audit_backlog_wait_time_master;
 -
        ab = audit_buffer_alloc(ctx, gfp_mask, type);
        if (!ab) {
                audit_log_lost("out of memory in audit_log_start");
        }
  
        audit_get_stamp(ab->ctx, &t, &serial);
 -
        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
                         t.tv_sec, t.tv_nsec/1000000, serial);
 +
        return ab;
  }
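The rewritten audit_log_start() gives each caller one wait budget: sleep_time starts at audit_backlog_wait_time, and because schedule_timeout() returns the unexpired portion, a task woken early that still finds the queue over limit sleeps only for what remains. A userspace sketch of the same budgeted wait, assuming pthreads and expressing the budget as an absolute deadline; the queue layout and names are illustrative:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct bounded_queue {
        pthread_mutex_t lock;
        pthread_cond_t  drained;        /* signalled when the queue shrinks */
        int             len;
        int             limit;
};

/* Returns true once there is room, false when the budget is spent
 * (the kernel path then calls audit_log_lost() and returns NULL). */
static bool wait_for_room(struct bounded_queue *q, long budget_ms)
{
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec  += budget_ms / 1000;
        deadline.tv_nsec += (budget_ms % 1000) * 1000000L;
        if (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_sec++;
                deadline.tv_nsec -= 1000000000L;
        }

        pthread_mutex_lock(&q->lock);
        while (q->len > q->limit) {
                /* Like schedule_timeout(), a timed wait can return early
                 * on a wakeup; the absolute deadline caps the total wait. */
                if (pthread_cond_timedwait(&q->drained, &q->lock,
                                           &deadline) == ETIMEDOUT) {
                        pthread_mutex_unlock(&q->lock);
                        return false;
                }
        }
        pthread_mutex_unlock(&q->lock);
        return true;
}

Callers that must not sleep (no __GFP_DIRECT_RECLAIM in the kernel version) skip the wait and fail straight away.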
  
@@@ -1893,7 -1760,7 +1893,7 @@@ void audit_copy_inode(struct audit_name
   * @call_panic: optional pointer to int that will be updated if secid fails
   */
  void audit_log_name(struct audit_context *context, struct audit_names *n,
-                   struct path *path, int record_num, int *call_panic)
+                   const struct path *path, int record_num, int *call_panic)
  {
        struct audit_buffer *ab;
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
@@@ -2081,7 -1948,7 +2081,7 @@@ EXPORT_SYMBOL(audit_log_task_info)
   * @operation: specific link operation
   * @link: the path that triggered the restriction
   */
- void audit_log_link_denied(const char *operation, struct path *link)
+ void audit_log_link_denied(const char *operation, const struct path *link)
  {
        struct audit_buffer *ab;
        struct audit_names *name;
   * audit_log_end - end one audit record
   * @ab: the audit_buffer
   *
 - * netlink_unicast() cannot be called inside an irq context because it blocks
 - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
 - * on a queue and a tasklet is scheduled to remove them from the queue outside
 - * the irq context.  May be called in any context.
 + * We cannot do a netlink send inside an irq context because it blocks (the
 + * last argument, flags, is not set to MSG_DONTWAIT), so the audit buffer is
 + * placed on a queue and the kauditd thread is woken to remove it from the
 + * queue outside the irq context.  May be called in any context.
   */
  void audit_log_end(struct audit_buffer *ab)
  {
        if (!audit_rate_check()) {
                audit_log_lost("rate limit exceeded");
        } else {
 -              struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 -
 -              nlh->nlmsg_len = ab->skb->len;
 -              kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
 -
 -              /*
 -               * The original kaudit unicast socket sends up messages with
 -               * nlmsg_len set to the payload length rather than the entire
 -               * message length.  This breaks the standard set by netlink.
 -               * The existing auditd daemon assumes this breakage.  Fixing
 -               * this would require co-ordinating a change in the established
 -               * protocol between the kaudit kernel subsystem and the auditd
 -               * userspace code.
 -               */
 -              nlh->nlmsg_len -= NLMSG_HDRLEN;
 -
 -              if (audit_pid) {
 -                      skb_queue_tail(&audit_skb_queue, ab->skb);
 -                      wake_up_interruptible(&kauditd_wait);
 -              } else {
 -                      audit_printk_skb(ab->skb);
 -              }
 +              skb_queue_tail(&audit_queue, ab->skb);
 +              wake_up_interruptible(&kauditd_wait);
                ab->skb = NULL;
        }
        audit_buffer_free(ab);
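With the unicast bookkeeping and the printk fallback moved out, audit_log_end() reduces to an unconditional enqueue plus wakeup; deciding whether a record goes to auditd, the retry queue, or the console is now kauditd's job. A minimal sketch of that producer side, with a linked list and condition variable standing in for the skb queue and kauditd_wait (all names illustrative):

#include <pthread.h>
#include <stddef.h>

struct record {
        struct record *next;
};

struct work_queue {
        pthread_mutex_t lock;
        pthread_cond_t  more_work;      /* plays the role of kauditd_wait */
        struct record  *head;
        struct record  *tail;
};

static void enqueue_and_wake(struct work_queue *q, struct record *r)
{
        r->next = NULL;
        pthread_mutex_lock(&q->lock);
        if (q->tail)
                q->tail->next = r;      /* like skb_queue_tail() */
        else
                q->head = r;
        q->tail = r;
        pthread_mutex_unlock(&q->lock);
        pthread_cond_signal(&q->more_work);     /* wake_up_interruptible() */
}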
diff --combined kernel/audit_fsnotify.c
@@@ -74,7 -74,7 +74,7 @@@ int audit_mark_compare(struct audit_fsn
  }
  
  static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
-                            struct inode *inode)
+                            const struct inode *inode)
  {
        audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
        audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
@@@ -130,9 -130,10 +130,9 @@@ static void audit_mark_log_rule_change(
        ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
 -      audit_log_format(ab, "auid=%u ses=%u op=",
 +      audit_log_format(ab, "auid=%u ses=%u op=%s",
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
 -                       audit_get_sessionid(current));
 -      audit_log_string(ab, op);
 +                       audit_get_sessionid(current), op);
        audit_log_format(ab, " path=");
        audit_log_untrustedstring(ab, audit_mark->path);
        audit_log_key(ab, rule->filterkey);
@@@ -167,11 -168,11 +167,11 @@@ static int audit_mark_handle_event(stru
                                    struct inode *to_tell,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
-                                   u32 mask, void *data, int data_type,
+                                   u32 mask, const void *data, int data_type,
                                    const unsigned char *dname, u32 cookie)
  {
        struct audit_fsnotify_mark *audit_mark;
-       struct inode *inode = NULL;
+       const struct inode *inode = NULL;
  
        audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
  
  
        switch (data_type) {
        case (FSNOTIFY_EVENT_PATH):
-               inode = ((struct path *)data)->dentry->d_inode;
+               inode = ((const struct path *)data)->dentry->d_inode;
                break;
        case (FSNOTIFY_EVENT_INODE):
-               inode = (struct inode *)data;
+               inode = (const struct inode *)data;
                break;
        default:
                BUG();
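The handler hunks in this file, and the matching ones in kernel/audit_watch.c below, all make the same change: the payload arrives as a const void * plus a type tag, and each switch case casts it to the matching const-qualified type, so a handler can no longer modify an inode it was only handed to inspect. A reduced sketch of that dispatch, with simplified stand-ins for the fsnotify types:

#include <stddef.h>

enum payload_type { PAYLOAD_PATH, PAYLOAD_INODE };

struct inode  { unsigned long i_ino; };
struct dentry { struct inode *d_inode; };
struct path   { struct dentry *dentry; };

static const struct inode *payload_inode(const void *data,
                                         enum payload_type type)
{
        switch (type) {
        case PAYLOAD_PATH:
                return ((const struct path *)data)->dentry->d_inode;
        case PAYLOAD_INODE:
                return (const struct inode *)data;
        }
        return NULL;    /* the kernel handlers BUG() here instead */
}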
diff --combined kernel/audit_tree.c
@@@ -458,7 -458,8 +458,7 @@@ static void audit_tree_log_remove_rule(
        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
 -      audit_log_format(ab, "op=");
 -      audit_log_string(ab, "remove_rule");
 +      audit_log_format(ab, "op=remove_rule");
        audit_log_format(ab, " dir=");
        audit_log_untrustedstring(ab, rule->tree->pathname);
        audit_log_key(ab, rule->filterkey);
@@@ -947,7 -948,7 +947,7 @@@ static int audit_tree_handle_event(stru
                                   struct inode *to_tell,
                                   struct fsnotify_mark *inode_mark,
                                   struct fsnotify_mark *vfsmount_mark,
-                                  u32 mask, void *data, int data_type,
+                                  u32 mask, const void *data, int data_type,
                                   const unsigned char *file_name, u32 cookie)
  {
        return 0;
diff --combined kernel/audit_watch.c
@@@ -242,9 -242,10 +242,9 @@@ static void audit_watch_log_rule_change
                ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
                if (unlikely(!ab))
                        return;
 -              audit_log_format(ab, "auid=%u ses=%u op=",
 +              audit_log_format(ab, "auid=%u ses=%u op=%s",
                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 -                               audit_get_sessionid(current));
 -              audit_log_string(ab, op);
 +                               audit_get_sessionid(current), op);
                audit_log_format(ab, " path=");
                audit_log_untrustedstring(ab, w->path);
                audit_log_key(ab, r->filterkey);
@@@ -471,10 -472,10 +471,10 @@@ static int audit_watch_handle_event(str
                                    struct inode *to_tell,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
-                                   u32 mask, void *data, int data_type,
+                                   u32 mask, const void *data, int data_type,
                                    const unsigned char *dname, u32 cookie)
  {
-       struct inode *inode;
+       const struct inode *inode;
        struct audit_parent *parent;
  
        parent = container_of(inode_mark, struct audit_parent, mark);
  
        switch (data_type) {
        case (FSNOTIFY_EVENT_PATH):
-               inode = d_backing_inode(((struct path *)data)->dentry);
+               inode = d_backing_inode(((const struct path *)data)->dentry);
                break;
        case (FSNOTIFY_EVENT_INODE):
-               inode = (struct inode *)data;
+               inode = (const struct inode *)data;
                break;
        default:
                BUG();
@@@ -547,8 -548,8 +547,8 @@@ int audit_exe_compare(struct task_struc
        exe_file = get_task_exe_file(tsk);
        if (!exe_file)
                return 0;
 -      ino = exe_file->f_inode->i_ino;
 -      dev = exe_file->f_inode->i_sb->s_dev;
 +      ino = file_inode(exe_file)->i_ino;
 +      dev = file_inode(exe_file)->i_sb->s_dev;
        fput(exe_file);
        return audit_mark_compare(mark, ino, dev);
  }
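The last hunk trades direct f_inode dereferences for the file_inode() accessor, so every caller funnels through one helper and a future change to how a file finds its inode touches a single line. A toy sketch of the pattern (the real helper lives in include/linux/fs.h; these structs are reduced stand-ins):

struct super_block { unsigned long s_dev; };
struct inode { unsigned long i_ino; struct super_block *i_sb; };
struct file  { struct inode *f_inode; };

static inline struct inode *file_inode(const struct file *f)
{
        return f->f_inode;      /* one place to change, many callers */
}

/* usage, mirroring audit_exe_compare():
 *      ino = file_inode(exe_file)->i_ino;
 *      dev = file_inode(exe_file)->i_sb->s_dev;
 */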