Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
diff --combined fs/btrfs/file.c

index ea9f10b,af059c4..c98805c
--- 1/fs/btrfs/file.c
--- 2/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@@ -1696,24 -1696,27 +1696,26 @@@ again
                         btrfs_end_write_no_snapshoting(root);
                         btrfs_delalloc_release_metadata(inode, release_bytes);
                 } else {
-                       btrfs_delalloc_release_space(inode, pos, release_bytes);
+                       btrfs_delalloc_release_space(inode,
+                                               round_down(pos, root->sectorsize),
+                                               release_bytes);
                 }
         }
   
         return num_written ? num_written : ret;
   }
   
- -static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- -                                  struct iov_iter *from,
- -                                  loff_t pos)
+ +static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
   {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file_inode(file);
+ +      loff_t pos = iocb->ki_pos;
         ssize_t written;
         ssize_t written_buffered;
         loff_t endbyte;
         int err;
   
- -      written = generic_file_direct_write(iocb, from, pos);
+ +      written = generic_file_direct_write(iocb, from);
   
         if (written < 0 || !iov_iter_count(from))
                 return written;
@@@ -1831,7 -1834,7 +1833,7 @@@ static ssize_t btrfs_file_write_iter(st
                 atomic_inc(&BTRFS_I(inode)->sync_writers);
   
         if (iocb->ki_flags & IOCB_DIRECT) {
- -              num_written = __btrfs_direct_write(iocb, from, pos);
+ +              num_written = __btrfs_direct_write(iocb, from);
         } else {
                 num_written = __btrfs_buffered_write(file, from, pos);
                 if (num_written > 0)
@@@ -1851,8 -1854,11 +1853,8 @@@
         spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->last_sub_trans = root->log_transid;
         spin_unlock(&BTRFS_I(inode)->lock);
- -      if (num_written > 0) {
- -              err = generic_write_sync(file, pos, num_written);
- -              if (err < 0)
- -                      num_written = err;
- -      }
+ +      if (num_written > 0)
+ +              num_written = generic_write_sync(iocb, num_written);
   
         if (sync)
                 atomic_dec(&BTRFS_I(inode)->sync_writers);
@@@ -2952,7 -2958,7 +2954,7 @@@ const struct file_operations btrfs_file
         .fallocate      = btrfs_fallocate,
         .unlocked_ioctl = btrfs_ioctl,
   #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
   #endif
         .copy_file_range = btrfs_copy_file_range,
         .clone_file_range = btrfs_clone_file_range,
diff --combined fs/btrfs/inode.c

index 6b7fe29,8fc99fb..91419ef
--- 1/fs/btrfs/inode.c
--- 2/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -824,6 -824,7 +824,7 @@@ retry
                                                 async_extent->ram_size - 1, 0);
                         goto out_free_reserve;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
   
                 /*
                  * clear dirty, set writeback and unlock the pages.
@@@ -861,6 -862,7 +862,7 @@@
         }
         return;
   out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
   out_free:
         extent_clear_unlock_delalloc(inode, async_extent->start,
@@@ -1038,6 -1040,8 +1040,8 @@@ static noinline int cow_file_range(stru
                                 goto out_drop_extent_cache;
                 }
   
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ 
                 if (disk_num_bytes < cur_alloc_size)
                         break;
   
@@@ -1066,6 -1070,7 +1070,7 @@@ out
   out_drop_extent_cache:
         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
   out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
   out_unlock:
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@@ -1377,6 -1382,9 +1382,9 @@@ next_slot
                          */
                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                 goto out_check;
+                       if (!btrfs_inc_nocow_writers(root->fs_info,
+                                                    disk_bytenr))
+                               goto out_check;
                         nocow = 1;
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                         extent_end = found_key.offset +
@@@ -1391,6 -1399,9 +1399,9 @@@ out_check
                         path->slots[0]++;
                         if (!nolock && nocow)
                                 btrfs_end_write_no_snapshoting(root);
+                       if (nocow)
+                               btrfs_dec_nocow_writers(root->fs_info,
+                                                       disk_bytenr);
                         goto next_slot;
                 }
                 if (!nocow) {
@@@ -1411,6 -1422,9 +1422,9 @@@
                         if (ret) {
                                 if (!nolock && nocow)
                                         btrfs_end_write_no_snapshoting(root);
+                               if (nocow)
+                                       btrfs_dec_nocow_writers(root->fs_info,
+                                                               disk_bytenr);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@@ -1453,6 -1467,8 +1467,8 @@@
   
                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                                num_bytes, num_bytes, type);
+               if (nocow)
+                       btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
                 BUG_ON(ret); /* -ENOMEM */
   
                 if (root->root_key.objectid ==
@@@ -7129,6 -7145,43 +7145,43 @@@ out
         return em;
   }
   
+ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+ {
+       struct extent_map *em = NULL;
+       int ret;
+ 
+       down_read(&BTRFS_I(inode)->dio_sem);
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = create_pinned_em(inode, start, len, orig_start,
+                                     block_start, block_len, orig_block_len,
+                                     ram_bytes, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (ret) {
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_cache(inode, start,
+                                               start + len - 1, 0);
+               }
+               em = ERR_PTR(ret);
+       }
+  out:
+       up_read(&BTRFS_I(inode)->dio_sem);
+ 
+       return em;
+ }
+ 
   static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                   u64 start, u64 len)
   {
@@@ -7144,41 -7197,13 +7197,13 @@@
         if (ret)
                 return ERR_PTR(ret);
   
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+       if (IS_ERR(em))
                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
- 
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
   
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
         return em;
   }
   
@@@ -7650,24 -7675,21 +7675,21 @@@ static int btrfs_get_blocks_direct(stru
                 block_start = em->block_start + (start - em->start);
   
                 if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1) {
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
+ 
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(root->fs_info, block_start);
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       ret = PTR_ERR(em);
-                                       goto unlock_err;
-                               }
+                               em = em2;
                         }
- 
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
                                 goto unlock_err;
                         }
                         goto unlock;
@@@ -8541,13 -8563,13 +8563,13 @@@ out
         return retval;
   }
   
- -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- -                             loff_t offset)
+ +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
   {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_dio_data dio_data = { 0 };
+ +      loff_t offset = iocb->ki_pos;
         size_t count = 0;
         int flags = 0;
         bool wakeup = true;
@@@ -8607,7 -8629,7 +8629,7 @@@
   
         ret = __blockdev_direct_IO(iocb, inode,
                                    BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- -                                 iter, offset, btrfs_get_blocks_direct, NULL,
+ +                                 iter, btrfs_get_blocks_direct, NULL,
                                    btrfs_submit_direct, flags);
         if (iov_iter_rw(iter) == WRITE) {
                 current->journal_info = NULL;
@@@ -9230,6 -9252,7 +9252,7 @@@ struct inode *btrfs_alloc_inode(struct 
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->delayed_iput);
         RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
   
         return inode;
   }
@@@ -9387,10 -9410,281 +9410,281 @@@ static int btrfs_getattr(struct vfsmoun
         return 0;
   }
   
+ static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+ 
+       /* we only allow renaming a subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+ 
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+ 
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+ 
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+ 
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+ 
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+ 
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+ 
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+ 
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+ 
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+ 
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+ out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+ 
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       ret = btrfs_end_transaction(trans, root);
+ out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+ 
+       return ret;
+ }
+ 
+ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+ {
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+ 
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+ 
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+ 
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+ 
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+ 
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+ 
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+ 
+       ret = btrfs_update_inode(trans, root, inode);
+ out:
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+ 
+       return ret;
+ }
+ 
   static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
   {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@@ -9399,6 -9693,7 +9693,7 @@@
         u64 root_objectid;
         int ret;
         u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
   
         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                 return -EPERM;
@@@ -9449,15 -9744,21 +9744,21 @@@
          * We want to reserve the absolute worst case amount of items.  So if
          * both inodes are subvols and we need to unlink them then that would
          * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
-                 ret = PTR_ERR(trans);
-                 goto out_notrans;
-         }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
   
         if (dest != root)
                 btrfs_record_root_in_trans(trans, dest);
@@@ -9471,6 -9772,8 +9772,8 @@@
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@@ -9478,14 -9781,6 +9781,6 @@@
                                              btrfs_ino(new_dir), index);
                 if (ret)
                         goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
         }
   
         inode_inc_iversion(old_dir);
@@@ -9552,12 -9847,46 +9847,46 @@@
         if (old_inode->i_nlink == 1)
                 BTRFS_I(old_inode)->dir_index = index;
   
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                 struct dentry *parent = new_dentry->d_parent;
+ 
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+ 
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+ 
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
         }
   out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+ 
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
         btrfs_end_transaction(trans, root);
   out_notrans:
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@@ -9570,10 -9899,14 +9899,14 @@@ static int btrfs_rename2(struct inode *
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
   {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
   
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+ 
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
   }
   
   static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@@ -9942,6 -10275,7 +10275,7 @@@ static int __btrfs_prealloc_file_range(
                                 btrfs_end_transaction(trans, root);
                         break;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
   
                 last_alloc = ins.offset;
                 ret = insert_reserved_file_extent(trans, inode,
@@@ -10160,10 -10494,10 +10494,10 @@@ static const struct inode_operations bt
         .symlink        = btrfs_symlink,
         .setattr        = btrfs_setattr,
         .mknod          = btrfs_mknod,
- -      .setxattr       = btrfs_setxattr,
+ +      .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
- -      .removexattr    = btrfs_removexattr,
+ +      .removexattr    = generic_removexattr,
         .permission     = btrfs_permission,
         .get_acl        = btrfs_get_acl,
         .set_acl        = btrfs_set_acl,
@@@ -10184,7 -10518,7 +10518,7 @@@ static const struct file_operations btr
         .iterate        = btrfs_real_readdir,
         .unlocked_ioctl = btrfs_ioctl,
   #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
   #endif
         .release        = btrfs_release_file,
         .fsync          = btrfs_sync_file,
@@@ -10237,10 -10571,10 +10571,10 @@@ static const struct address_space_opera
   static const struct inode_operations btrfs_file_inode_operations = {
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
- -      .setxattr       = btrfs_setxattr,
+ +      .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
- -      .removexattr    = btrfs_removexattr,
+ +      .removexattr    = generic_removexattr,
         .permission     = btrfs_permission,
         .fiemap         = btrfs_fiemap,
         .get_acl        = btrfs_get_acl,
@@@ -10251,10 -10585,10 +10585,10 @@@ static const struct inode_operations bt
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
         .permission     = btrfs_permission,
- -      .setxattr       = btrfs_setxattr,
+ +      .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
- -      .removexattr    = btrfs_removexattr,
+ +      .removexattr    = generic_removexattr,
         .get_acl        = btrfs_get_acl,
         .set_acl        = btrfs_set_acl,
         .update_time    = btrfs_update_time,
@@@ -10265,10 -10599,10 +10599,10 @@@ static const struct inode_operations bt
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
         .permission     = btrfs_permission,
- -      .setxattr       = btrfs_setxattr,
+ +      .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
- -      .removexattr    = btrfs_removexattr,
+ +      .removexattr    = generic_removexattr,
         .update_time    = btrfs_update_time,
   };
   
diff --combined fs/btrfs/ioctl.c

index 0b8ba71,73c0be7..4e70069
--- 1/fs/btrfs/ioctl.c
--- 2/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@@ -125,10 -125,10 +125,10 @@@ static unsigned int btrfs_flags_to_ioct
         if (flags & BTRFS_INODE_NODATACOW)
                 iflags |= FS_NOCOW_FL;
   
-       if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
-               iflags |= FS_COMPR_FL;
-       else if (flags & BTRFS_INODE_NOCOMPRESS)
+       if (flags & BTRFS_INODE_NOCOMPRESS)
                 iflags |= FS_NOCOMP_FL;
+       else if (flags & BTRFS_INODE_COMPRESS)
+               iflags |= FS_COMPR_FL;
   
         return iflags;
   }
@@@ -439,7 -439,7 +439,7 @@@ static noinline int create_subvol(struc
   {
         struct btrfs_trans_handle *trans;
         struct btrfs_key key;
-       struct btrfs_root_item root_item;
+       struct btrfs_root_item *root_item;
         struct btrfs_inode_item *inode_item;
         struct extent_buffer *leaf;
         struct btrfs_root *root = BTRFS_I(dir)->root;
@@@ -455,16 -455,22 +455,22 @@@
         u64 qgroup_reserved;
         uuid_le new_uuid;
   
+       root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+       if (!root_item)
+               return -ENOMEM;
+ 
         ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
         if (ret)
-               return ret;
+               goto fail_free;
   
         /*
          * Don't create subvolume whose level is not zero. Or qgroup will be
          * screwed up since it assumes subvolume qgroup's level to be 0.
          */
-       if (btrfs_qgroup_level(objectid))
-               return -ENOSPC;
+       if (btrfs_qgroup_level(objectid)) {
+               ret = -ENOSPC;
+               goto fail_free;
+       }
   
         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
         /*
@@@ -474,14 -480,14 +480,14 @@@
         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
                                                8, &qgroup_reserved, false);
         if (ret)
-               return ret;
+               goto fail_free;
   
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 btrfs_subvolume_release_metadata(root, &block_rsv,
                                                  qgroup_reserved);
-               return ret;
+               goto fail_free;
         }
         trans->block_rsv = &block_rsv;
         trans->bytes_reserved = block_rsv.size;
@@@ -509,47 -515,45 +515,45 @@@
                             BTRFS_UUID_SIZE);
         btrfs_mark_buffer_dirty(leaf);
   
-       memset(&root_item, 0, sizeof(root_item));
- 
-       inode_item = &root_item.inode;
+       inode_item = &root_item->inode;
         btrfs_set_stack_inode_generation(inode_item, 1);
         btrfs_set_stack_inode_size(inode_item, 3);
         btrfs_set_stack_inode_nlink(inode_item, 1);
         btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
   
-       btrfs_set_root_flags(&root_item, 0);
-       btrfs_set_root_limit(&root_item, 0);
+       btrfs_set_root_flags(root_item, 0);
+       btrfs_set_root_limit(root_item, 0);
         btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
   
-       btrfs_set_root_bytenr(&root_item, leaf->start);
-       btrfs_set_root_generation(&root_item, trans->transid);
-       btrfs_set_root_level(&root_item, 0);
-       btrfs_set_root_refs(&root_item, 1);
-       btrfs_set_root_used(&root_item, leaf->len);
-       btrfs_set_root_last_snapshot(&root_item, 0);
+       btrfs_set_root_bytenr(root_item, leaf->start);
+       btrfs_set_root_generation(root_item, trans->transid);
+       btrfs_set_root_level(root_item, 0);
+       btrfs_set_root_refs(root_item, 1);
+       btrfs_set_root_used(root_item, leaf->len);
+       btrfs_set_root_last_snapshot(root_item, 0);
   
-       btrfs_set_root_generation_v2(&root_item,
-                       btrfs_root_generation(&root_item));
+       btrfs_set_root_generation_v2(root_item,
+                       btrfs_root_generation(root_item));
         uuid_le_gen(&new_uuid);
-       memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
-       btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
-       btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
-       root_item.ctime = root_item.otime;
-       btrfs_set_root_ctransid(&root_item, trans->transid);
-       btrfs_set_root_otransid(&root_item, trans->transid);
+       memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+       btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+       btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+       root_item->ctime = root_item->otime;
+       btrfs_set_root_ctransid(root_item, trans->transid);
+       btrfs_set_root_otransid(root_item, trans->transid);
   
         btrfs_tree_unlock(leaf);
         free_extent_buffer(leaf);
         leaf = NULL;
   
-       btrfs_set_root_dirid(&root_item, new_dirid);
+       btrfs_set_root_dirid(root_item, new_dirid);
   
         key.objectid = objectid;
         key.offset = 0;
         key.type = BTRFS_ROOT_ITEM_KEY;
         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-                               &root_item);
+                               root_item);
         if (ret)
                 goto fail;
   
@@@ -601,12 -605,13 +605,13 @@@
         BUG_ON(ret);
   
         ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
-                                 root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+                                 root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
                                   objectid);
         if (ret)
                 btrfs_abort_transaction(trans, root, ret);
   
   fail:
+       kfree(root_item);
         trans->block_rsv = NULL;
         trans->bytes_reserved = 0;
         btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@@ -629,6 -634,10 +634,10 @@@
                 d_instantiate(dentry, inode);
         }
         return ret;
+ 
+ fail_free:
+       kfree(root_item);
+       return ret;
   }
   
   static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@@ -681,7 -690,7 +690,7 @@@ static int create_snapshot(struct btrfs
         if (ret)
                 goto dec_and_free;
   
-       btrfs_wait_ordered_extents(root, -1);
+       btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
   
         btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                              BTRFS_BLOCK_RSV_TEMP);
@@@ -837,11 -846,9 +846,11 @@@ static noinline int btrfs_mksubvol(stru
         struct dentry *dentry;
         int error;
   
- -      error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- -      if (error == -EINTR)
- -              return error;
+ +      inode_lock_nested(dir, I_MUTEX_PARENT);
+ +      // XXX: should've been
+ +      // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ +      // if (error == -EINTR)
+ +      //      return error;
   
         dentry = lookup_one_len(name, parent->dentry, namelen);
         error = PTR_ERR(dentry);
@@@ -2368,11 -2375,9 +2377,11 @@@ static noinline int btrfs_ioctl_snap_de
                 goto out;
   
   
- -      err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
- -      if (err == -EINTR)
- -              goto out_drop_write;
+ +      inode_lock_nested(dir, I_MUTEX_PARENT);
+ +      // XXX: should've been
+ +      // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ +      // if (err == -EINTR)
+ +      //      goto out_drop_write;
         dentry = lookup_one_len(vol_args->name, parent, namelen);
         if (IS_ERR(dentry)) {
                 err = PTR_ERR(dentry);
@@@ -2562,7 -2567,7 +2571,7 @@@ out_dput
         dput(dentry);
   out_unlock_dir:
         inode_unlock(dir);
- -out_drop_write:
+ +//out_drop_write:
         mnt_drop_write_file(file);
   out:
         kfree(vol_args);
@@@ -2671,10 -2676,10 +2680,10 @@@ out
         return ret;
   }
   
- static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
   {
         struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
-       struct btrfs_ioctl_vol_args *vol_args;
+       struct btrfs_ioctl_vol_args_v2 *vol_args;
         int ret;
   
         if (!capable(CAP_SYS_ADMIN))
@@@ -2690,7 -2695,9 +2699,9 @@@
                 goto err_drop;
         }
   
-       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+       /* Check for compatibility: reject unknown flags */
+       if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+               return -EOPNOTSUPP;
   
         if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                         1)) {
@@@ -2699,13 -2706,23 +2710,23 @@@
         }
   
         mutex_lock(&root->fs_info->volume_mutex);
-       ret = btrfs_rm_device(root, vol_args->name);
+       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+               ret = btrfs_rm_device(root, NULL, vol_args->devid);
+       } else {
+               vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+               ret = btrfs_rm_device(root, vol_args->name, 0);
+       }
         mutex_unlock(&root->fs_info->volume_mutex);
         atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
   
-       if (!ret)
-               btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
- 
+       if (!ret) {
+               if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+                       btrfs_info(root->fs_info, "device deleted: id %llu",
+                                       vol_args->devid);
+               else
+                       btrfs_info(root->fs_info, "device deleted: %s",
+                                       vol_args->name);
+       }
   out:
         kfree(vol_args);
   err_drop:
@@@ -2713,6 -2730,47 +2734,47 @@@
         return ret;
   }
   
+ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ {
+       struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+       struct btrfs_ioctl_vol_args *vol_args;
+       int ret;
+ 
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+ 
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+ 
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+               goto out_drop_write;
+       }
+ 
+       vol_args = memdup_user(arg, sizeof(*vol_args));
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
+ 
+       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+       mutex_lock(&root->fs_info->volume_mutex);
+       ret = btrfs_rm_device(root, vol_args->name, 0);
+       mutex_unlock(&root->fs_info->volume_mutex);
+ 
+       if (!ret)
+               btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+       kfree(vol_args);
+ out:
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ out_drop_write:
+       mnt_drop_write_file(file);
+ 
+       return ret;
+ }
+ 
   static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
   {
         struct btrfs_ioctl_fs_info_args *fi_args;
@@@ -3472,13 -3530,16 +3534,16 @@@ static int btrfs_clone(struct inode *sr
         u64 last_dest_end = destoff;
   
         ret = -ENOMEM;
-       buf = vmalloc(root->nodesize);
-       if (!buf)
-               return ret;
+       buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+       if (!buf) {
+               buf = vmalloc(root->nodesize);
+               if (!buf)
+                       return ret;
+       }
   
         path = btrfs_alloc_path();
         if (!path) {
-               vfree(buf);
+               kvfree(buf);
                 return ret;
         }
   
@@@ -3779,7 -3840,7 +3844,7 @@@ process_slot
   
   out:
         btrfs_free_path(path);
-       vfree(buf);
+       kvfree(buf);
         return ret;
   }
   
@@@ -4380,7 -4441,7 +4445,7 @@@ static long btrfs_ioctl_dev_replace(str
                         1)) {
                         ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                 } else {
-                       ret = btrfs_dev_replace_start(root, p);
+                       ret = btrfs_dev_replace_by_ioctl(root, p);
                         atomic_set(
                          &root->fs_info->mutually_exclusive_operation_running,
                          0);
@@@ -4851,8 -4912,8 +4916,8 @@@ static long btrfs_ioctl_qgroup_assign(s
         /* update qgroup status and info */
         err = btrfs_run_qgroups(trans, root->fs_info);
         if (err < 0)
-               btrfs_std_error(root->fs_info, ret,
-                           "failed to update qgroup status and info\n");
+               btrfs_handle_fs_error(root->fs_info, err,
+                           "failed to update qgroup status and info");
         err = btrfs_end_transaction(trans, root);
         if (err && !ret)
                 ret = err;
@@@ -5398,9 -5459,15 +5463,15 @@@ static int btrfs_ioctl_set_features(str
         if (ret)
                 return ret;
   
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+ 
         trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_drop_write;
+       }
   
         spin_lock(&root->fs_info->super_lock);
         newflags = btrfs_super_compat_flags(super_block);
@@@ -5419,7 -5486,11 +5490,11 @@@
         btrfs_set_super_incompat_flags(super_block, newflags);
         spin_unlock(&root->fs_info->super_lock);
   
-       return btrfs_commit_transaction(trans, root);
+       ret = btrfs_commit_transaction(trans, root);
+ out_drop_write:
+       mnt_drop_write_file(file);
+ 
+       return ret;
   }
   
   long btrfs_ioctl(struct file *file, unsigned int
@@@ -5463,6 -5534,8 +5538,8 @@@
                 return btrfs_ioctl_add_dev(root, argp);
         case BTRFS_IOC_RM_DEV:
                 return btrfs_ioctl_rm_dev(file, argp);
+       case BTRFS_IOC_RM_DEV_V2:
+               return btrfs_ioctl_rm_dev_v2(file, argp);
         case BTRFS_IOC_FS_INFO:
                 return btrfs_ioctl_fs_info(root, argp);
         case BTRFS_IOC_DEV_INFO:
@@@ -5556,3 -5629,24 +5633,24 @@@
   
         return -ENOTTY;
   }
+ 
+ #ifdef CONFIG_COMPAT
+ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+ {
+       switch (cmd) {
+       case FS_IOC32_GETFLAGS:
+               cmd = FS_IOC_GETFLAGS;
+               break;
+       case FS_IOC32_SETFLAGS:
+               cmd = FS_IOC_SETFLAGS;
+               break;
+       case FS_IOC32_GETVERSION:
+               cmd = FS_IOC_GETVERSION;
+               break;
+       default:
+               return -ENOIOCTLCMD;
+       }
+ 
+       return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+ }
+ #endif
diff --combined fs/btrfs/tree-log.c

index e692eea,6aaab31..8aaca5c
--- 1/fs/btrfs/tree-log.c
--- 2/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@@ -4141,6 -4141,7 +4141,7 @@@ static int btrfs_log_changed_extents(st
   
         INIT_LIST_HEAD(&extents);
   
+       down_write(&BTRFS_I(inode)->dio_sem);
         write_lock(&tree->lock);
         test_gen = root->fs_info->last_trans_committed;
   
@@@ -4169,13 -4170,20 +4170,20 @@@
         }
   
         list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
         /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
          */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
   process:
         while (!list_empty(&extents)) {
                 em = list_entry(extents.next, struct extent_map, list);
@@@ -4202,6 -4210,7 +4210,7 @@@
         }
         WARN_ON(!list_empty(&extents));
         write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
   
         btrfs_release_path(path);
         return ret;
@@@ -4622,23 -4631,6 +4631,6 @@@ static int btrfs_log_inode(struct btrfs
   
         mutex_lock(&BTRFS_I(inode)->log_mutex);
   
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
- 
         /*
          * a brute force approach to making sure we get the most uptodate
          * copies of everything.
@@@ -4846,21 -4838,6 +4838,6 @@@ log_extents
                         goto out_unlock;
         }
         if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                 &logged_list, ctx, start, end);
                 if (ret) {
@@@ -4988,7 -4965,7 +4965,7 @@@ static noinline int check_parent_dirs_f
                         goto out;
   
         if (!S_ISDIR(inode->i_mode)) {
- -              if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ +              if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         goto out;
                 inode = d_inode(parent);
         }
@@@ -5009,7 -4986,7 +4986,7 @@@
                         break;
                 }
   
- -              if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ +              if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         break;
   
                 if (IS_ROOT(parent))
@@@ -5158,7 -5135,7 +5135,7 @@@ process_leaf
                         }
   
                         ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
+                       if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                 log_mode = LOG_INODE_ALL;
                         btrfs_release_path(path);
                         ret = btrfs_log_inode(trans, root, di_inode,
@@@ -5278,11 -5255,16 +5255,16 @@@ static int btrfs_log_all_parents(struc
                         if (IS_ERR(dir_inode))
                                 continue;
   
+                       if (ctx)
+                               ctx->log_new_dentries = false;
                         ret = btrfs_log_inode(trans, root, dir_inode,
                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
                         if (!ret &&
                             btrfs_must_commit_transaction(trans, dir_inode))
                                 ret = 1;
+                       if (!ret && ctx && ctx->log_new_dentries)
+                               ret = log_new_dir_dentries(trans, root,
+                                                          dir_inode, ctx);
                         iput(dir_inode);
                         if (ret)
                                 goto out;
@@@ -5422,7 -5404,7 +5404,7 @@@ static int btrfs_log_inode_parent(struc
         }
   
         while (1) {
- -              if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ +              if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         break;
   
                 inode = d_inode(parent);
@@@ -5519,7 -5501,7 +5501,7 @@@ int btrfs_recover_log_trees(struct btrf
   
         ret = walk_log_tree(trans, log_root_tree, &wc);
         if (ret) {
-               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
                             "recovering log root tree.");
                 goto error;
         }
@@@ -5533,7 -5515,7 +5515,7 @@@ again
                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
   
                 if (ret < 0) {
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't find tree log root.");
                         goto error;
                 }
@@@ -5551,7 -5533,7 +5533,7 @@@
                 log = btrfs_read_fs_root(log_root_tree, &found_key);
                 if (IS_ERR(log)) {
                         ret = PTR_ERR(log);
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't read tree log root.");
                         goto error;
                 }
@@@ -5566,7 -5548,7 +5548,7 @@@
                         free_extent_buffer(log->node);
                         free_extent_buffer(log->commit_root);
                         kfree(log);
-                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
                                     "for tree log recovery.");
                         goto error;
                 }
@@@ -5652,11 -5634,9 +5634,9 @@@ void btrfs_record_unlink_dir(struct btr
          * into the file.  When the file is logged we check it and
          * don't log the parents if the file is fully on disk.
          */
-       if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&BTRFS_I(inode)->log_mutex);
-               BTRFS_I(inode)->last_unlink_trans = trans->transid;
-               mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       }
+       mutex_lock(&BTRFS_I(inode)->log_mutex);
+       BTRFS_I(inode)->last_unlink_trans = trans->transid;
+       mutex_unlock(&BTRFS_I(inode)->log_mutex);
   
         /*
          * if this directory was already logged any new
diff --combined fs/btrfs/volumes.c

index bfb80da,e029030..2b88127
--- 1/fs/btrfs/volumes.c
--- 2/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -20,13 -20,13 +20,13 @@@
   #include <linux/slab.h>
   #include <linux/buffer_head.h>
   #include <linux/blkdev.h>
- -#include <linux/random.h>
   #include <linux/iocontext.h>
   #include <linux/capability.h>
   #include <linux/ratelimit.h>
   #include <linux/kthread.h>
   #include <linux/raid/pq.h>
   #include <linux/semaphore.h>
+ +#include <linux/uuid.h>
   #include <asm/div64.h>
   #include "ctree.h"
   #include "extent_map.h"
@@@ -118,6 -118,21 +118,21 @@@ const u64 btrfs_raid_group[BTRFS_NR_RAI
         [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
   };
   
+ /*
+  * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
+  * condition is not met. Zero means there's no corresponding
+  * BTRFS_ERROR_DEV_*_NOT_MET value.
+  */
+ const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+       [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+       [BTRFS_RAID_DUP]    = 0,
+       [BTRFS_RAID_RAID0]  = 0,
+       [BTRFS_RAID_SINGLE] = 0,
+       [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+       [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+ };
+ 
   static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_device *device);
@@@ -699,7 -714,8 +714,8 @@@ static noinline int device_list_add(con
          * if there is new btrfs on an already registered device,
          * then remove the stale device entry.
          */
-       btrfs_free_stale_device(device);
+       if (ret > 0)
+               btrfs_free_stale_device(device);
   
         *fs_devices_ret = fs_devices;
   
@@@ -988,6 -1004,56 +1004,56 @@@ int btrfs_open_devices(struct btrfs_fs_
         return ret;
   }
   
+ void btrfs_release_disk_super(struct page *page)
+ {
+       kunmap(page);
+       put_page(page);
+ }
+ 
+ int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+               struct page **page, struct btrfs_super_block **disk_super)
+ {
+       void *p;
+       pgoff_t index;
+ 
+       /* make sure our super fits in the device */
+       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+               return 1;
+ 
+       /* make sure our super fits in the page */
+       if (sizeof(**disk_super) > PAGE_SIZE)
+               return 1;
+ 
+       /* make sure our super doesn't straddle pages on disk */
+       index = bytenr >> PAGE_SHIFT;
+       if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
+               return 1;
+ 
+       /* pull in the page with our super */
+       *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+                                  index, GFP_KERNEL);
+ 
+       if (IS_ERR_OR_NULL(*page))
+               return 1;
+ 
+       p = kmap(*page);
+ 
+       /* align our pointer to the offset of the super block */
+       *disk_super = p + (bytenr & ~PAGE_MASK);
+ 
+       if (btrfs_super_bytenr(*disk_super) != bytenr ||
+           btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
+               btrfs_release_disk_super(*page);
+               return 1;
+       }
+ 
+       if ((*disk_super)->label[0] &&
+               (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
+               (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ 
+       return 0;
+ }
+ 
   /*
    * Look for a btrfs signature on a device. This may be called out of the mount path
    * and we are not allowed to call set_blocksize during the scan. The superblock
@@@ -999,13 -1065,11 +1065,11 @@@ int btrfs_scan_one_device(const char *p
         struct btrfs_super_block *disk_super;
         struct block_device *bdev;
         struct page *page;
-       void *p;
         int ret = -EINVAL;
         u64 devid;
         u64 transid;
         u64 total_devices;
         u64 bytenr;
-       pgoff_t index;
   
         /*
          * we would like to check all the supers, but that would make
@@@ -1018,41 -1082,14 +1082,14 @@@
         mutex_lock(&uuid_mutex);
   
         bdev = blkdev_get_by_path(path, flags, holder);
- 
         if (IS_ERR(bdev)) {
                 ret = PTR_ERR(bdev);
                 goto error;
         }
   
-       /* make sure our super fits in the device */
-       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
-               goto error_bdev_put;
- 
-       /* make sure our super fits in the page */
-       if (sizeof(*disk_super) > PAGE_SIZE)
-               goto error_bdev_put;
- 
-       /* make sure our super doesn't straddle pages on disk */
-       index = bytenr >> PAGE_SHIFT;
-       if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
-               goto error_bdev_put;
- 
-       /* pull in the page with our super */
-       page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
-                                  index, GFP_NOFS);
- 
-       if (IS_ERR_OR_NULL(page))
+       if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
                 goto error_bdev_put;
   
-       p = kmap(page);
- 
-       /* align our pointer to the offset of the super block */
-       disk_super = p + (bytenr & ~PAGE_MASK);
- 
-       if (btrfs_super_bytenr(disk_super) != bytenr ||
-           btrfs_super_magic(disk_super) != BTRFS_MAGIC)
-               goto error_unmap;
- 
         devid = btrfs_stack_device_id(&disk_super->dev_item);
         transid = btrfs_super_generation(disk_super);
         total_devices = btrfs_super_num_devices(disk_super);
@@@ -1060,8 -1097,6 +1097,6 @@@
         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
         if (ret > 0) {
                 if (disk_super->label[0]) {
-                       if (disk_super->label[BTRFS_LABEL_SIZE - 1])
-                               disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
                         printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
                 } else {
                         printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
@@@ -1073,9 -1108,7 +1108,7 @@@
         if (!ret && fs_devices_ret)
                 (*fs_devices_ret)->total_devices = total_devices;
   
- error_unmap:
-       kunmap(page);
-       put_page(page);
+       btrfs_release_disk_super(page);
   
   error_bdev_put:
         blkdev_put(bdev, flags);
@@@ -1454,7 -1487,7 +1487,7 @@@ again
                 extent = btrfs_item_ptr(leaf, path->slots[0],
                                         struct btrfs_dev_extent);
         } else {
-               btrfs_std_error(root->fs_info, ret, "Slot search failed");
+               btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
                 goto out;
         }
   
@@@ -1462,7 -1495,7 +1495,7 @@@
   
         ret = btrfs_del_item(trans, root, path);
         if (ret) {
-               btrfs_std_error(root->fs_info, ret,
+               btrfs_handle_fs_error(root->fs_info, ret,
                             "Failed to remove dev extent item");
         } else {
                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
@@@ -1688,32 -1721,92 +1721,92 @@@ out
         return ret;
   }
   
- int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+ /*
+  * Verify that @num_devices satisfies the RAID profile constraints in the whole
+  * filesystem. It's up to the caller to adjust that number regarding e.g. device
+  * replace.
+  */
+ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+               u64 num_devices)
+ {
+       u64 all_avail;
+       unsigned seq;
+       int i;
+ 
+       do {
+               seq = read_seqbegin(&fs_info->profiles_lock);
+ 
+               all_avail = fs_info->avail_data_alloc_bits |
+                           fs_info->avail_system_alloc_bits |
+                           fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&fs_info->profiles_lock, seq));
+ 
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+               if (!(all_avail & btrfs_raid_group[i]))
+                       continue;
+ 
+               if (num_devices < btrfs_raid_array[i].devs_min) {
+                       int ret = btrfs_raid_mindev_error[i];
+ 
+                       if (ret)
+                               return ret;
+               }
+       }
+ 
+       return 0;
+ }
+ 
+ struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
+                                       struct btrfs_device *device)
   {
-       struct btrfs_device *device;
         struct btrfs_device *next_device;
-       struct block_device *bdev;
-       struct buffer_head *bh = NULL;
-       struct btrfs_super_block *disk_super;
+ 
+       list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+               if (next_device != device &&
+                       !next_device->missing && next_device->bdev)
+                       return next_device;
+       }
+ 
+       return NULL;
+ }
+ 
+ /*
+  * Helper function to check if the given device is part of s_bdev / latest_bdev
+  * and replace it with the provided or the next active device. In the context
+  * where this function is called, there should always be another device (or
+  * this_dev) which is active.
+  */
+ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+               struct btrfs_device *device, struct btrfs_device *this_dev)
+ {
+       struct btrfs_device *next_device;
+ 
+       if (this_dev)
+               next_device = this_dev;
+       else
+               next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+                                                               device);
+       ASSERT(next_device);
+ 
+       if (fs_info->sb->s_bdev &&
+                       (fs_info->sb->s_bdev == device->bdev))
+               fs_info->sb->s_bdev = next_device->bdev;
+ 
+       if (fs_info->fs_devices->latest_bdev == device->bdev)
+               fs_info->fs_devices->latest_bdev = next_device->bdev;
+ }
+ 
+ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+ {
+       struct btrfs_device *device;
         struct btrfs_fs_devices *cur_devices;
-       u64 all_avail;
-       u64 devid;
         u64 num_devices;
-       u8 *dev_uuid;
-       unsigned seq;
         int ret = 0;
         bool clear_super = false;
+       char *dev_name = NULL;
   
         mutex_lock(&uuid_mutex);
   
-       do {
-               seq = read_seqbegin(&root->fs_info->profiles_lock);
- 
-               all_avail = root->fs_info->avail_data_alloc_bits |
-                           root->fs_info->avail_system_alloc_bits |
-                           root->fs_info->avail_metadata_alloc_bits;
-       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
- 
         num_devices = root->fs_info->fs_devices->num_devices;
         btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
@@@ -1722,78 -1815,23 +1815,23 @@@
         }
         btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
   
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-               ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
-               goto out;
-       }
- 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-               ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+       ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+       if (ret)
                 goto out;
-       }
   
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
-           root->fs_info->fs_devices->rw_devices <= 2) {
-               ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
-               goto out;
-       }
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
-           root->fs_info->fs_devices->rw_devices <= 3) {
-               ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+       ret = btrfs_find_device_by_devspec(root, devid, device_path,
+                               &device);
+       if (ret)
                 goto out;
-       }
- 
-       if (strcmp(device_path, "missing") == 0) {
-               struct list_head *devices;
-               struct btrfs_device *tmp;
- 
-               device = NULL;
-               devices = &root->fs_info->fs_devices->devices;
-               /*
-                * It is safe to read the devices since the volume_mutex
-                * is held.
-                */
-               list_for_each_entry(tmp, devices, dev_list) {
-                       if (tmp->in_fs_metadata &&
-                           !tmp->is_tgtdev_for_dev_replace &&
-                           !tmp->bdev) {
-                               device = tmp;
-                               break;
-                       }
-               }
-               bdev = NULL;
-               bh = NULL;
-               disk_super = NULL;
-               if (!device) {
-                       ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-                       goto out;
-               }
-       } else {
-               ret = btrfs_get_bdev_and_sb(device_path,
-                                           FMODE_WRITE | FMODE_EXCL,
-                                           root->fs_info->bdev_holder, 0,
-                                           &bdev, &bh);
-               if (ret)
-                       goto out;
-               disk_super = (struct btrfs_super_block *)bh->b_data;
-               devid = btrfs_stack_device_id(&disk_super->dev_item);
-               dev_uuid = disk_super->dev_item.uuid;
-               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
-                                          disk_super->fsid);
-               if (!device) {
-                       ret = -ENOENT;
-                       goto error_brelse;
-               }
-       }
   
         if (device->is_tgtdev_for_dev_replace) {
                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
-               goto error_brelse;
+               goto out;
         }
   
         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
-               goto error_brelse;
+               goto out;
         }
   
         if (device->writeable) {
@@@ -1801,6 -1839,11 +1839,11 @@@
                 list_del_init(&device->dev_alloc_list);
                 device->fs_devices->rw_devices--;
                 unlock_chunks(root);
+               dev_name = kstrdup(device->name->str, GFP_KERNEL);
+               if (!dev_name) {
+                       ret = -ENOMEM;
+                       goto error_undo;
+               }
                 clear_super = true;
         }
   
@@@ -1842,12 -1885,7 +1885,7 @@@
         if (device->missing)
                 device->fs_devices->missing_devices--;
   
-       next_device = list_entry(root->fs_info->fs_devices->devices.next,
-                                struct btrfs_device, dev_list);
-       if (device->bdev == root->fs_info->sb->s_bdev)
-               root->fs_info->sb->s_bdev = next_device->bdev;
-       if (device->bdev == root->fs_info->fs_devices->latest_bdev)
-               root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+       btrfs_assign_next_active_device(root->fs_info, device, NULL);
   
         if (device->bdev) {
                 device->fs_devices->open_devices--;
@@@ -1883,63 -1921,23 +1921,23 @@@
          * at this point, the device is zero sized.  We want to
          * remove it from the devices list and zero out the old super
          */
-       if (clear_super && disk_super) {
-               u64 bytenr;
-               int i;
- 
-               /* make sure this device isn't detected as part of
-                * the FS anymore
-                */
-               memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-               set_buffer_dirty(bh);
-               sync_dirty_buffer(bh);
- 
-               /* clear the mirror copies of super block on the disk
-                * being removed, 0th copy is been taken care above and
-                * the below would take of the rest
-                */
-               for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-                       bytenr = btrfs_sb_offset(i);
-                       if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-                                       i_size_read(bdev->bd_inode))
-                               break;
- 
-                       brelse(bh);
-                       bh = __bread(bdev, bytenr / 4096,
-                                       BTRFS_SUPER_INFO_SIZE);
-                       if (!bh)
-                               continue;
- 
-                       disk_super = (struct btrfs_super_block *)bh->b_data;
- 
-                       if (btrfs_super_bytenr(disk_super) != bytenr ||
-                               btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-                               continue;
-                       }
-                       memset(&disk_super->magic, 0,
-                                               sizeof(disk_super->magic));
-                       set_buffer_dirty(bh);
-                       sync_dirty_buffer(bh);
+       if (clear_super) {
+               struct block_device *bdev;
+ 
+               bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
+                                               root->fs_info->bdev_holder);
+               if (!IS_ERR(bdev)) {
+                       btrfs_scratch_superblocks(bdev, dev_name);
+                       blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
                 }
         }
   
-       ret = 0;
- 
-       if (bdev) {
-               /* Notify udev that device has changed */
-               btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
- 
-               /* Update ctime/mtime for device path for libblkid */
-               update_dev_time(device_path);
-       }
- 
- error_brelse:
-       brelse(bh);
-       if (bdev)
-               blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
   out:
+       kfree(dev_name);
+ 
         mutex_unlock(&uuid_mutex);
         return ret;
+ 
   error_undo:
         if (device->writeable) {
                 lock_chunks(root);
@@@ -1948,7 -1946,7 +1946,7 @@@
                 device->fs_devices->rw_devices++;
                 unlock_chunks(root);
         }
-       goto error_brelse;
+       goto out;
   }
   
   void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
@@@ -1972,11 -1970,8 +1970,8 @@@
         if (srcdev->missing)
                 fs_devices->missing_devices--;
   
-       if (srcdev->writeable) {
+       if (srcdev->writeable)
                 fs_devices->rw_devices--;
-               /* zero out the old super if it is writable */
-               btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
-       }
   
         if (srcdev->bdev)
                 fs_devices->open_devices--;
@@@ -1987,6 -1982,10 +1982,10 @@@ void btrfs_rm_dev_replace_free_srcdev(s
   {
         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
   
+       if (srcdev->writeable) {
+               /* zero out the old super if it is writable */
+               btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+       }
         call_rcu(&srcdev->rcu, free_device);
   
         /*
@@@ -2016,32 -2015,33 +2015,33 @@@
   void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
                                       struct btrfs_device *tgtdev)
   {
-       struct btrfs_device *next_device;
- 
         mutex_lock(&uuid_mutex);
         WARN_ON(!tgtdev);
         mutex_lock(&fs_info->fs_devices->device_list_mutex);
   
         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
   
-       if (tgtdev->bdev) {
-               btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+       if (tgtdev->bdev)
                 fs_info->fs_devices->open_devices--;
-       }
+ 
         fs_info->fs_devices->num_devices--;
   
-       next_device = list_entry(fs_info->fs_devices->devices.next,
-                                struct btrfs_device, dev_list);
-       if (tgtdev->bdev == fs_info->sb->s_bdev)
-               fs_info->sb->s_bdev = next_device->bdev;
-       if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
-               fs_info->fs_devices->latest_bdev = next_device->bdev;
-       list_del_rcu(&tgtdev->dev_list);
+       btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
   
-       call_rcu(&tgtdev->rcu, free_device);
+       list_del_rcu(&tgtdev->dev_list);
   
         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
         mutex_unlock(&uuid_mutex);
+ 
+       /*
+        * The update_dev_time() call within btrfs_scratch_superblocks()
+        * may lead to a call to btrfs_show_devname() which will try
+        * to hold device_list_mutex. And here this device
+        * is already out of the device list, so we don't have to hold
+        * the device_list_mutex lock.
+        */
+       btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+       call_rcu(&tgtdev->rcu, free_device);
   }
   
   static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@@ -2101,6 -2101,31 +2101,31 @@@ int btrfs_find_device_missing_or_by_pat
         }
   }
   
+ /*
+  * Look up a device by device id, or by path if the id is 0.
+  */
+ int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+                                        char *devpath,
+                                        struct btrfs_device **device)
+ {
+       int ret;
+ 
+       if (devid) {
+               ret = 0;
+               *device = btrfs_find_device(root->fs_info, devid, NULL,
+                                           NULL);
+               if (!*device)
+                       ret = -ENOENT;
+       } else {
+               if (!devpath || !devpath[0])
+                       return -EINVAL;
+ 
+               ret = btrfs_find_device_missing_or_by_path(root, devpath,
+                                                          device);
+       }
+       return ret;
+ }
+ 
   /*
    * does all the dirty work required for changing file system's UUID.
    */
@@@ -2418,7 -2443,7 +2443,7 @@@ int btrfs_init_new_device(struct btrfs_
   
                 ret = btrfs_relocate_sys_chunks(root);
                 if (ret < 0)
-                       btrfs_std_error(root->fs_info, ret,
+                       btrfs_handle_fs_error(root->fs_info, ret,
                                     "Failed to relocate sys chunks after "
                                     "device initialization. This can be fixed "
                                     "using the \"btrfs balance\" command.");
@@@ -2663,7 -2688,7 +2688,7 @@@ static int btrfs_free_chunk(struct btrf
         if (ret < 0)
                 goto out;
         else if (ret > 0) { /* Logic error or corruption */
-               btrfs_std_error(root->fs_info, -ENOENT,
+               btrfs_handle_fs_error(root->fs_info, -ENOENT,
                             "Failed lookup while freeing chunk.");
                 ret = -ENOENT;
                 goto out;
@@@ -2671,7 -2696,7 +2696,7 @@@
   
         ret = btrfs_del_item(trans, root, path);
         if (ret < 0)
-               btrfs_std_error(root->fs_info, ret,
+               btrfs_handle_fs_error(root->fs_info, ret,
                             "Failed to delete chunk item.");
   out:
         btrfs_free_path(path);
@@@ -2857,7 -2882,7 +2882,7 @@@ static int btrfs_relocate_chunk(struct 
                                                      chunk_offset);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
-               btrfs_std_error(root->fs_info, ret, NULL);
+               btrfs_handle_fs_error(root->fs_info, ret, NULL);
                 return ret;
         }
   
@@@ -3402,6 -3427,7 +3427,7 @@@ static int __btrfs_balance(struct btrfs
         u32 count_meta = 0;
         u32 count_sys = 0;
         int chunk_reserved = 0;
+       u64 bytes_used = 0;
   
         /* step one make some room on all the devices */
         devices = &fs_info->fs_devices->devices;
@@@ -3540,7 -3566,13 +3566,13 @@@ again
                         goto loop;
                 }
   
-               if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+               ASSERT(fs_info->data_sinfo);
+               spin_lock(&fs_info->data_sinfo->lock);
+               bytes_used = fs_info->data_sinfo->bytes_used;
+               spin_unlock(&fs_info->data_sinfo->lock);
+ 
+               if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+                   !chunk_reserved && !bytes_used) {
                         trans = btrfs_start_transaction(chunk_root, 0);
                         if (IS_ERR(trans)) {
                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@@ -3632,7 -3664,7 +3664,7 @@@ static void __cancel_balance(struct btr
         unset_balance_control(fs_info);
         ret = del_balance_item(fs_info->tree_root);
         if (ret)
-               btrfs_std_error(fs_info, ret, NULL);
+               btrfs_handle_fs_error(fs_info, ret, NULL);
   
         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
   }
@@@ -3693,10 -3725,8 +3725,8 @@@ int btrfs_balance(struct btrfs_balance_
                 num_devices--;
         }
         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
-       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (num_devices == 1)
-               allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (num_devices > 1)
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+       if (num_devices > 1)
                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
         if (num_devices > 2)
                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
@@@ -5278,7 -5308,15 +5308,15 @@@ static int __btrfs_map_block(struct btr
         stripe_nr = div64_u64(stripe_nr, stripe_len);
   
         stripe_offset = stripe_nr * stripe_len;
-       BUG_ON(offset < stripe_offset);
+       if (offset < stripe_offset) {
+               btrfs_crit(fs_info, "stripe math has gone wrong, "
+                          "stripe_offset=%llu, offset=%llu, start=%llu, "
+                          "logical=%llu, stripe_len=%llu",
+                          stripe_offset, offset, em->start, logical,
+                          stripe_len);
+               free_extent_map(em);
+               return -EINVAL;
+       }
   
         /* stripe_offset is the offset of this block in its stripe*/
         stripe_offset = offset - stripe_offset;
@@@ -5519,7 -5557,13 +5557,13 @@@
                                 &stripe_index);
                 mirror_num = stripe_index + 1;
         }
-       BUG_ON(stripe_index >= map->num_stripes);
+       if (stripe_index >= map->num_stripes) {
+               btrfs_crit(fs_info, "stripe index math went horribly wrong, "
+                          "got stripe_index=%u, num_stripes=%u",
+                          stripe_index, map->num_stripes);
+               ret = -EINVAL;
+               goto out;
+       }
   
         num_alloc_stripes = num_stripes;
         if (dev_replace_is_ongoing) {
@@@ -6242,7 -6286,7 +6286,7 @@@ static int read_one_chunk(struct btrfs_
                         "invalid chunk length %llu", length);
                 return -EIO;
         }
-       if (!is_power_of_2(stripe_len)) {
+       if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
                 btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
                           stripe_len);
                 return -EIO;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
		1	2
fs/btrfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/tree-log.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/volumes.c	patch \|	diff1 \|	diff2 \|	blob \| history