Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...

author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c

index 8729cf6..f55721f 100644 (file)
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
         return ret;
  }
  
-/*
- * this makes the path point to (inum INODE_ITEM ioff)
- */
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                       struct btrfs_path *path)
-{
-       struct btrfs_key key;
-       return btrfs_find_item(fs_root, path, inum, ioff,
-                       BTRFS_INODE_ITEM_KEY, &key);
-}
-
-static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                               struct btrfs_path *path,
-                               struct btrfs_key *found_key)
-{
-       return btrfs_find_item(fs_root, path, inum, ioff,
-                       BTRFS_INODE_REF_KEY, found_key);
-}
-
  int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
                           u64 start_off, struct btrfs_path *path,
                           struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                         btrfs_tree_read_unlock_blocking(eb);
                         free_extent_buffer(eb);
                 }
-               ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+               ret = btrfs_find_item(fs_root, path, parent, 0,
+                               BTRFS_INODE_REF_KEY, &found_key);
                 if (ret > 0)
                         ret = -ENOENT;
                 if (ret)
@@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
         struct btrfs_key found_key;
  
         while (!ret) {
-               ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-                                    &found_key);
+               ret = btrfs_find_item(fs_root, path, inum,
+                               parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+                               &found_key);
+
                 if (ret < 0)
                         break;
                 if (ret) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h

index 2a1ac6b..9c41fba 100644 (file)
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
  typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
                 void *ctx);
  
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                       struct btrfs_path *path);
-
  int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
                         struct btrfs_path *path, struct btrfs_key *found_key,
                         u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 4aadadc..de5e4f2 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
  
         struct btrfs_delayed_node *delayed_node;
  
+       /* File creation time. */
+       struct timespec i_otime;
+
         struct inode vfs_inode;
  };
  
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c

index 14a72ed..9936421 100644 (file)
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
   */
  static void add_root_to_dirty_list(struct btrfs_root *root)
  {
+       if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+           !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+               return;
+
         spin_lock(&root->fs_info->trans_lock);
-       if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
-           list_empty(&root->dirty_list)) {
-               list_add(&root->dirty_list,
-                        &root->fs_info->dirty_cowonly_roots);
+       if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+               /* Want the extent tree to be the last on the list */
+               if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+                       list_move_tail(&root->dirty_list,
+                                      &root->fs_info->dirty_cowonly_roots);
+               else
+                       list_move(&root->dirty_list,
+                                 &root->fs_info->dirty_cowonly_roots);
         }
         spin_unlock(&root->fs_info->trans_lock);
  }
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
  
         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                 BUG_ON(tm->slot != 0);
-               eb_rewin = alloc_dummy_extent_buffer(eb->start,
-                                               fs_info->tree_root->nodesize);
+               eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                 if (!eb_rewin) {
                         btrfs_tree_read_unlock_blocking(eb);
                         free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
         } else if (old_root) {
                 btrfs_tree_read_unlock(eb_root);
                 free_extent_buffer(eb_root);
-               eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+               eb = alloc_dummy_extent_buffer(root->fs_info, logical);
         } else {
                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                 eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
                 if ((search <= target && target - search <= 65536) ||
                     (search > target && search - target <= 65536)) {
                         gen = btrfs_node_ptr_generation(node, nr);
-                       readahead_tree_block(root, search, blocksize);
+                       readahead_tree_block(root, search);
                         nread += blocksize;
                 }
                 nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
         u64 gen;
         u64 block1 = 0;
         u64 block2 = 0;
-       int blocksize;
  
         parent = path->nodes[level + 1];
         if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
  
         nritems = btrfs_header_nritems(parent);
         slot = path->slots[level + 1];
-       blocksize = root->nodesize;
  
         if (slot > 0) {
                 block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
         }
  
         if (block1)
-               readahead_tree_block(root, block1, blocksize);
+               readahead_tree_block(root, block1);
         if (block2)
-               readahead_tree_block(root, block2, blocksize);
+               readahead_tree_block(root, block2);
  }
  
  
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
         return 0;
  }
  
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
                 u64 iobjectid, u64 ioff, u8 key_type,
                 struct btrfs_key *found_key)
  {
         int ret;
         struct btrfs_key key;
         struct extent_buffer *eb;
-       struct btrfs_path *path;
+
+       ASSERT(path);
+       ASSERT(found_key);
  
         key.type = key_type;
         key.objectid = iobjectid;
         key.offset = ioff;
  
-       if (found_path == NULL) {
-               path = btrfs_alloc_path();
-               if (!path)
-                       return -ENOMEM;
-       } else
-               path = found_path;
-
         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
-       if ((ret < 0) || (found_key == NULL)) {
-               if (path != found_path)
-                       btrfs_free_path(path);
+       if (ret < 0)
                 return ret;
-       }
  
         eb = path->nodes[0];
         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
         add_root_to_dirty_list(root);
         extent_buffer_get(c);
         path->nodes[level] = c;
-       path->locks[level] = BTRFS_WRITE_LOCK;
+       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
         path->slots[level] = 0;
         return 0;
  }
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
         path->search_for_split = 1;
         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
         path->search_for_split = 0;
+       if (ret > 0)
+               ret = -EAGAIN;
         if (ret < 0)
                 goto err;
  
         ret = -EAGAIN;
         leaf = path->nodes[0];
-       /* if our item isn't there or got smaller, return now */
-       if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+       /* if our item isn't there, return now */
+       if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
                 goto err;
  
         /* the leaf has  changed, it now has room.  return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 0b18070..84c3b00 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
  
  #define BTRFS_DIRTY_METADATA_THRESH    (32 * 1024 * 1024)
  
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
  /*
   * The key defines the order in the tree, and so it also defines (optimal)
   * block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
                                          BTRFS_BLOCK_GROUP_RAID6 |   \
                                          BTRFS_BLOCK_GROUP_DUP |     \
                                          BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK  (BTRFS_BLOCK_GROUP_RAID5 |   \
+                                        BTRFS_BLOCK_GROUP_RAID6)
+
  /*
   * We need a bit for restriper to be able to tell when chunks of type
   * SINGLE are available.  This "extended" profile format is used in
@@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state {
         BTRFS_DC_ERROR          = 1,
         BTRFS_DC_CLEAR          = 2,
         BTRFS_DC_SETUP          = 3,
-       BTRFS_DC_NEED_WRITE     = 4,
  };
  
  struct btrfs_caching_control {
@@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache {
         unsigned long full_stripe_len;
  
         unsigned int ro:1;
-       unsigned int dirty:1;
         unsigned int iref:1;
         unsigned int has_caching_ctl:1;
         unsigned int removed:1;
@@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache {
         struct list_head ro_list;
  
         atomic_t trimming;
+
+       /* For dirty block groups */
+       struct list_head dirty_list;
  };
  
  /* delayed seq elem */
@@ -1741,6 +1747,7 @@ struct btrfs_fs_info {
  
         spinlock_t unused_bgs_lock;
         struct list_head unused_bgs;
+       struct mutex unused_bg_unpin_mutex;
  
         /* For btrfs to record security options */
         struct security_mnt_opts security_opts;
@@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers {
  #define BTRFS_ROOT_DEFRAG_RUNNING      6
  #define BTRFS_ROOT_FORCE_COW           7
  #define BTRFS_ROOT_MULTI_LOG_TASKS     8
+#define BTRFS_ROOT_DIRTY               9
  
  /*
   * in ram representation of the tree.  extent_root is used for all allocations
@@ -1794,8 +1802,6 @@ struct btrfs_root {
         struct btrfs_fs_info *fs_info;
         struct extent_io_tree dirty_log_pages;
  
-       struct kobject root_kobj;
-       struct completion kobj_unregister;
         struct mutex objectid_mutex;
  
         spinlock_t accounting_lock;
@@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
  BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
  BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
  BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-static inline struct btrfs_timespec *
-btrfs_inode_atime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, atime);
-       return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, mtime);
-       return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, ctime);
-       return (struct btrfs_timespec *)ptr;
-}
-
  BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
  BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
  BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c

index de4e70f..82f0c7c 100644 (file)
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
         btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
         btrfs_set_stack_inode_block_group(inode_item, 0);
  
-       btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->atime,
                                      inode->i_atime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->atime,
                                       inode->i_atime.tv_nsec);
  
-       btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->mtime,
                                      inode->i_mtime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->mtime,
                                       inode->i_mtime.tv_nsec);
  
-       btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->ctime,
                                      inode->i_ctime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->ctime,
                                       inode->i_ctime.tv_nsec);
+
+       btrfs_set_stack_timespec_sec(&inode_item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_sec);
+       btrfs_set_stack_timespec_nsec(&inode_item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_nsec);
  }
  
  int btrfs_fill_inode(struct inode *inode, u32 *rdev)
  {
         struct btrfs_delayed_node *delayed_node;
         struct btrfs_inode_item *inode_item;
-       struct btrfs_timespec *tspec;
  
         delayed_node = btrfs_get_delayed_node(inode);
         if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
         *rdev = btrfs_stack_inode_rdev(inode_item);
         BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
  
-       tspec = btrfs_inode_atime(inode_item);
-       inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+       inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+       inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+       inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
  
-       tspec = btrfs_inode_mtime(inode_item);
-       inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+       inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
  
-       tspec = btrfs_inode_ctime(inode_item);
-       inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       BTRFS_I(inode)->i_otime.tv_sec =
+               btrfs_stack_timespec_sec(&inode_item->otime);
+       BTRFS_I(inode)->i_otime.tv_nsec =
+               btrfs_stack_timespec_nsec(&inode_item->otime);
  
         inode->i_generation = BTRFS_I(inode)->generation;
         BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c

index ca6a3a3..5ec03d9 100644 (file)
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
   */
  static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
  {
-       s64 writers;
-       DEFINE_WAIT(wait);
-
         set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-       do {
-               prepare_to_wait(&fs_info->replace_wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
-               writers = percpu_counter_sum(&fs_info->bio_counter);
-               if (writers)
-                       schedule();
-               finish_wait(&fs_info->replace_wait, &wait);
-       } while (writers);
+       wait_event(fs_info->replace_wait, !percpu_counter_sum(
+                  &fs_info->bio_counter));
  }
  
  /*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
  
  void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
  {
-       DEFINE_WAIT(wait);
-again:
-       percpu_counter_inc(&fs_info->bio_counter);
-       if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+       while (1) {
+               percpu_counter_inc(&fs_info->bio_counter);
+               if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+                                    &fs_info->fs_state)))
+                       break;
+
                 btrfs_bio_counter_dec(fs_info);
                 wait_event(fs_info->replace_wait,
                            !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
                                      &fs_info->fs_state));
-               goto again;
         }
-
  }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 1afb182..f79f385 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                         memcpy(&found, result, csum_size);
  
                         read_extent_buffer(buf, &val, 0, csum_size);
-                       printk_ratelimited(KERN_INFO
+                       printk_ratelimited(KERN_WARNING
                                 "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
                                 "level %d\n",
                                 root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                 ret = 0;
                 goto out;
         }
-       printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+       printk_ratelimited(KERN_ERR
+           "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
                         eb->fs_info->sb->s_id, eb->start,
                         parent_transid, btrfs_header_generation(eb));
         ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
  
         found_start = btrfs_header_bytenr(eb);
         if (found_start != eb->start) {
-               printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
+               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
                                "%llu %llu\n",
                                eb->fs_info->sb->s_id, found_start, eb->start);
                 ret = -EIO;
                 goto err;
         }
         if (check_tree_block_fsid(root, eb)) {
-               printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
                                eb->fs_info->sb->s_id, eb->start);
                 ret = -EIO;
                 goto err;
         }
         found_level = btrfs_header_level(eb);
         if (found_level >= BTRFS_MAX_LEVEL) {
-               btrfs_info(root->fs_info, "bad tree block level %d",
+               btrfs_err(root->fs_info, "bad tree block level %d",
                            (int)btrfs_header_level(eb));
                 ret = -EIO;
                 goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
         .set_page_dirty = btree_set_page_dirty,
  };
  
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
  {
         struct extent_buffer *buf = NULL;
         struct inode *btree_inode = root->fs_info->btree_inode;
  
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
         if (!buf)
                 return;
         read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
         free_extent_buffer(buf);
  }
  
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
                          int mirror_num, struct extent_buffer **eb)
  {
         struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
         struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
         int ret;
  
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
         if (!buf)
                 return 0;
  
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
  }
  
  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-                                                u64 bytenr, u32 blocksize)
+                                                u64 bytenr)
  {
         if (btrfs_test_is_dummy_root(root))
-               return alloc_test_extent_buffer(root->fs_info, bytenr,
-                                               blocksize);
-       return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
+               return alloc_test_extent_buffer(root->fs_info, bytenr);
+       return alloc_extent_buffer(root->fs_info, bytenr);
  }
  
  
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
         struct extent_buffer *buf = NULL;
         int ret;
  
-       buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
         if (!buf)
                 return NULL;
  
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
         memset(&root->root_key, 0, sizeof(root->root_key));
         memset(&root->root_item, 0, sizeof(root->root_item));
         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
-       memset(&root->root_kobj, 0, sizeof(root->root_kobj));
         if (fs_info)
                 root->defrag_trans_start = fs_info->generation;
         else
                 root->defrag_trans_start = 0;
-       init_completion(&root->kobj_unregister);
         root->root_key.objectid = objectid;
         root->anon_dev = 0;
  
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
                                      bool check_ref)
  {
         struct btrfs_root *root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
         int ret;
  
         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
         if (ret)
                 goto fail;
  
-       ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
-                       location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
+       key.offset = location->objectid;
+
+       ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+       btrfs_free_path(path);
         if (ret < 0)
                 goto fail;
         if (ret == 0)
@@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb,
         spin_lock_init(&fs_info->qgroup_op_lock);
         spin_lock_init(&fs_info->buffer_lock);
         spin_lock_init(&fs_info->unused_bgs_lock);
+       mutex_init(&fs_info->unused_bg_unpin_mutex);
         rwlock_init(&fs_info->tree_mod_log_lock);
         mutex_init(&fs_info->reloc_mutex);
         mutex_init(&fs_info->delalloc_root_mutex);
@@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb,
                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
  
         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-               printk(KERN_ERR "BTRFS: has skinny extents\n");
+               printk(KERN_INFO "BTRFS: has skinny extents\n");
  
         /*
          * flag our filesystem as having big metadata blocks if
@@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb,
          */
         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
             (sectorsize != nodesize)) {
-               printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+               printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
                                 "are not allowed for mixed block groups on %s\n",
                                 sb->s_id);
                 goto fail_alloc;
@@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb,
         sb->s_blocksize_bits = blksize_bits(sectorsize);
  
         if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-               printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+               printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
                 goto fail_sb_buffer;
         }
  
         if (sectorsize != PAGE_SIZE) {
-               printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+               printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
                        "found on %s\n", (unsigned long)sectorsize, sb->s_id);
                 goto fail_sb_buffer;
         }
@@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb,
         ret = btrfs_read_sys_array(tree_root);
         mutex_unlock(&fs_info->chunk_mutex);
         if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to read the system "
+               printk(KERN_ERR "BTRFS: failed to read the system "
                        "array on %s\n", sb->s_id);
                 goto fail_sb_buffer;
         }
@@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb,
                                            generation);
         if (!chunk_root->node ||
             !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
-               printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
                        sb->s_id);
                 goto fail_tree_roots;
         }
@@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb,
  
         ret = btrfs_read_chunk_tree(chunk_root);
         if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
                        sb->s_id);
                 goto fail_tree_roots;
         }
@@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb,
         btrfs_close_extra_devices(fs_info, fs_devices, 0);
  
         if (!fs_devices->latest_bdev) {
-               printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
                        sb->s_id);
                 goto fail_tree_roots;
         }
@@ -2765,7 +2775,7 @@ retry_root_backup:
  
         ret = btrfs_recover_balance(fs_info);
         if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+               printk(KERN_ERR "BTRFS: failed to recover balance\n");
                 goto fail_block_groups;
         }
  
@@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
                                 btrfs_super_log_root(sb));
  
+       /*
+        * Check the lower bound, the alignment and other constraints are
+        * checked later.
+        */
+       if (btrfs_super_nodesize(sb) < 4096) {
+               printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+                               btrfs_super_nodesize(sb));
+               ret = -EINVAL;
+       }
+       if (btrfs_super_sectorsize(sb) < 4096) {
+               printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+                               btrfs_super_sectorsize(sb));
+               ret = -EINVAL;
+       }
+
         if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
                 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
                                 fs_info->fsid, sb->dev_item.fsid);
@@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
         if (btrfs_super_num_devices(sb) > (1UL << 31))
                 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
                                 btrfs_super_num_devices(sb));
+       if (btrfs_super_num_devices(sb) == 0) {
+               printk(KERN_ERR "BTRFS: number of devices is 0\n");
+               ret = -EINVAL;
+       }
  
         if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
                 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3880,6 +3909,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                 ret = -EINVAL;
         }
  
+       /*
+        * Obvious sys_chunk_array corruptions, it must hold at least one key
+        * and one chunk
+        */
+       if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+               printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+                               btrfs_super_sys_array_size(sb),
+                               BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+                       + sizeof(struct btrfs_chunk)) {
+               printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+                               btrfs_super_sys_array_size(sb),
+                               sizeof(struct btrfs_disk_key)
+                               + sizeof(struct btrfs_chunk));
+               ret = -EINVAL;
+       }
+
         /*
          * The generation is a global counter, we'll trust it more than the others
          * but it's still possible that it's the one that's wrong.
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h

index 4146518..27d44c0 100644 (file)
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
  
  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                                       u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
                          int mirror_num, struct extent_buffer **eb);
  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-                                                  u64 bytenr, u32 blocksize);
+                                                  u64 bytenr);
  void clean_tree_block(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *buf);
  int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index a684086..571f402 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
         RESERVE_ALLOC_NO_ACCOUNT = 2,
  };
  
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc);
  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                          */
                         ret = 0;
                 }
-               kfree(bbio);
+               btrfs_put_bbio(bbio);
         }
  
         if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
         struct btrfs_delayed_ref_head *head;
         int ret;
         int run_all = count == (unsigned long)-1;
-       int run_most = 0;
  
         /* We'll clean this up in btrfs_cleanup_transaction */
         if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 root = root->fs_info->tree_root;
  
         delayed_refs = &trans->transaction->delayed_refs;
-       if (count == 0) {
+       if (count == 0)
                 count = atomic_read(&delayed_refs->num_entries) * 2;
-               run_most = 1;
-       }
  
  again:
  #ifdef SCRAMBLE_DELAYED_REFS
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root)
  {
         struct btrfs_block_group_cache *cache;
-       int err = 0;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
         struct btrfs_path *path;
-       u64 last = 0;
+
+       if (list_empty(&cur_trans->dirty_bgs))
+               return 0;
  
         path = btrfs_alloc_path();
         if (!path)
                 return -ENOMEM;
  
-again:
-       while (1) {
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-               err = cache_save_setup(cache, trans, path);
-               last = cache->key.objectid + cache->key.offset;
-               btrfs_put_block_group(cache);
-       }
-
-       while (1) {
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-
-                       if (cache->dirty)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               if (cache->disk_cache_state == BTRFS_DC_SETUP)
-                       cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
-               cache->dirty = 0;
-               last = cache->key.objectid + cache->key.offset;
-
-               err = write_one_cache_group(trans, root, path, cache);
-               btrfs_put_block_group(cache);
-               if (err) /* File system offline */
-                       goto out;
-       }
-
-       while (1) {
-               /*
-                * I don't think this is needed since we're just marking our
-                * preallocated extent as written, but just in case it can't
-                * hurt.
-                */
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       /*
-                        * Really this shouldn't happen, but it could if we
-                        * couldn't write the entire preallocated extent and
-                        * splitting the extent resulted in a new block.
-                        */
-                       if (cache->dirty) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-                       if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               err = btrfs_write_out_cache(root, trans, cache, path);
-
-               /*
-                * If we didn't have an error then the cache state is still
-                * NEED_WRITE, so we can set it to WRITTEN.
-                */
-               if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                       cache->disk_cache_state = BTRFS_DC_WRITTEN;
-               last = cache->key.objectid + cache->key.offset;
+       /*
+        * We don't need the lock here since we are protected by the transaction
+        * commit.  We want to do the cache_save_setup first and then run the
+        * delayed refs to make sure we have the best chance at doing this all
+        * in one shot.
+        */
+       while (!list_empty(&cur_trans->dirty_bgs)) {
+               cache = list_first_entry(&cur_trans->dirty_bgs,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+               list_del_init(&cache->dirty_list);
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+               if (!ret)
+                       ret = btrfs_run_delayed_refs(trans, root,
+                                                    (unsigned long) -1);
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+                       btrfs_write_out_cache(root, trans, cache, path);
+               if (!ret)
+                       ret = write_one_cache_group(trans, root, path, cache);
                 btrfs_put_block_group(cache);
         }
-out:
  
         btrfs_free_path(path);
-       return err;
+       return ret;
  }
  
  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
  /**
   * drop_outstanding_extent - drop an outstanding extent
   * @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're releasing.
   *
   * This is called when we are freeing up an outstanding extent, either called
   * after an error or after an extent is written.  This will return the number of
   * reserved extents that need to be freed.  This must be called with
   * BTRFS_I(inode)->lock held.
   */
-static unsigned drop_outstanding_extent(struct inode *inode)
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
  {
         unsigned drop_inode_space = 0;
         unsigned dropped_extents = 0;
+       unsigned num_extents = 0;
  
-       BUG_ON(!BTRFS_I(inode)->outstanding_extents);
-       BTRFS_I(inode)->outstanding_extents--;
+       num_extents = (unsigned)div64_u64(num_bytes +
+                                         BTRFS_MAX_EXTENT_SIZE - 1,
+                                         BTRFS_MAX_EXTENT_SIZE);
+       ASSERT(num_extents);
+       ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+       BTRFS_I(inode)->outstanding_extents -= num_extents;
  
         if (BTRFS_I(inode)->outstanding_extents == 0 &&
             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
  
  out_fail:
         spin_lock(&BTRFS_I(inode)->lock);
-       dropped = drop_outstanding_extent(inode);
+       dropped = drop_outstanding_extent(inode, num_bytes);
         /*
          * If the inodes csum_bytes is the same as the original
          * csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
  
         num_bytes = ALIGN(num_bytes, root->sectorsize);
         spin_lock(&BTRFS_I(inode)->lock);
-       dropped = drop_outstanding_extent(inode);
+       dropped = drop_outstanding_extent(inode, num_bytes);
  
         if (num_bytes)
                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
         btrfs_free_reserved_data_space(inode, num_bytes);
  }
  
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc)
  {
         struct btrfs_block_group_cache *cache = NULL;
         struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
                         cache_block_group(cache, 1);
  
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                 byte_in_group = bytenr - cache->key.objectid;
                 WARN_ON(byte_in_group > cache->key.offset);
  
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
                     cache->disk_cache_state < BTRFS_DC_CLEAR)
                         cache->disk_cache_state = BTRFS_DC_CLEAR;
  
-               cache->dirty = 1;
                 old_val = btrfs_block_group_used(&cache->item);
                 num_bytes = min(total, cache->key.offset - byte_in_group);
                 if (alloc) {
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                 unpin = &fs_info->freed_extents[0];
  
         while (1) {
+               mutex_lock(&fs_info->unused_bg_unpin_mutex);
                 ret = find_first_extent_bit(unpin, 0, &start, &end,
                                             EXTENT_DIRTY, NULL);
-               if (ret)
+               if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                         break;
+               }
  
                 if (btrfs_test_opt(root, DISCARD))
                         ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
  
                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
                 unpin_extent_range(root, start, end, true);
+               mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                 cond_resched();
         }
  
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                         }
                 }
  
-               ret = update_block_group(root, bytenr, num_bytes, 0);
+               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                 if (ret) {
                         btrfs_abort_transaction(trans, extent_root, ret);
                         goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                            struct extent_buffer *buf,
                            u64 parent, int last_ref)
  {
-       struct btrfs_block_group_cache *cache = NULL;
         int pin = 1;
         int ret;
  
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
         if (!last_ref)
                 return;
  
-       cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-
         if (btrfs_header_generation(buf) == trans->transid) {
+               struct btrfs_block_group_cache *cache;
+
                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                         ret = check_ref_cleanup(trans, root, buf->start);
                         if (!ret)
                                 goto out;
                 }
  
+               cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                         pin_down_extent(root, cache, buf->start, buf->len, 1);
+                       btrfs_put_block_group(cache);
                         goto out;
                 }
  
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
  
                 btrfs_add_free_space(cache, buf->start, buf->len);
                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+               btrfs_put_block_group(cache);
                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                 pin = 0;
         }
@@ -6253,7 +6194,6 @@ out:
          * anymore.
          */
         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
-       btrfs_put_block_group(cache);
  }
  
  /* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
         if (ret)
                 return ret;
  
-       ret = update_block_group(root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
         if (ret) { /* -ENOENT, logic error */
                 btrfs_err(fs_info, "update block group failed for %llu %llu",
                         ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                         return ret;
         }
  
-       ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+       ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+                                1);
         if (ret) { /* -ENOENT, logic error */
                 btrfs_err(fs_info, "update block group failed for %llu %llu",
                         ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
  
  static struct extent_buffer *
  btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                     u64 bytenr, u32 blocksize, int level)
+                     u64 bytenr, int level)
  {
         struct extent_buffer *buf;
  
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
         if (!buf)
                 return ERR_PTR(-ENOMEM);
         btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
  
         if (btrfs_test_is_dummy_root(root)) {
                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
-                                           blocksize, level);
+                                           level);
                 if (!IS_ERR(buf))
                         root->alloc_bytenr += blocksize;
                 return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                 return ERR_PTR(ret);
         }
  
-       buf = btrfs_init_new_buffer(trans, root, ins.objectid,
-                                   blocksize, level);
+       buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
  
         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                                 continue;
                 }
  reada:
-               readahead_tree_block(root, bytenr, blocksize);
+               readahead_tree_block(root, bytenr);
                 nread++;
         }
         wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
  
         next = btrfs_find_tree_block(root, bytenr);
         if (!next) {
-               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+               next = btrfs_find_create_tree_block(root, bytenr);
                 if (!next)
                         return -ENOMEM;
                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
  
-       alloc_flags = update_block_group_flags(root, cache->flags);
-       if (alloc_flags != cache->flags) {
-               ret = do_chunk_alloc(trans, root, alloc_flags,
-                                    CHUNK_ALLOC_FORCE);
-               if (ret < 0)
-                       goto out;
-       }
-
         ret = set_block_group_ro(cache, 0);
         if (!ret)
                 goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                 goto out;
         ret = set_block_group_ro(cache, 0);
  out:
+       if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+               alloc_flags = update_block_group_flags(root, cache->flags);
+               check_system_chunk(trans, root, alloc_flags);
+       }
+
         btrfs_end_transaction(trans, root);
         return ret;
  }
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
         INIT_LIST_HEAD(&cache->cluster_list);
         INIT_LIST_HEAD(&cache->bg_list);
         INIT_LIST_HEAD(&cache->ro_list);
+       INIT_LIST_HEAD(&cache->dirty_list);
         btrfs_init_free_space_ctl(cache);
         atomic_set(&cache->trimming, 0);
  
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                          * b) Setting 'dirty flag' makes sure that we flush
                          *    the new space cache info onto disk.
                          */
-                       cache->disk_cache_state = BTRFS_DC_CLEAR;
                         if (btrfs_test_opt(root, SPACE_CACHE))
-                               cache->dirty = 1;
+                               cache->disk_cache_state = BTRFS_DC_CLEAR;
                 }
  
                 read_extent_buffer(leaf, &cache->item,
@@ -9460,6 +9397,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                 }
         }
  
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+
         btrfs_remove_free_space_cache(block_group);
  
         spin_lock(&block_group->space_info->lock);
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                  * Want to do this before we do anything else so we can recover
                  * properly if we fail to join the transaction.
                  */
-               trans = btrfs_join_transaction(root);
+               /* 1 for btrfs_orphan_reserve_metadata() */
+               trans = btrfs_start_transaction(root, 1);
                 if (IS_ERR(trans)) {
                         btrfs_set_block_group_rw(root, block_group);
                         ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                  */
                 start = block_group->key.objectid;
                 end = start + block_group->key.offset - 1;
+               /*
+                * Hold the unused_bg_unpin_mutex lock to avoid racing with
+                * btrfs_finish_extent_commit(). If we are at transaction N,
+                * another task might be running finish_extent_commit() for the
+                * previous transaction N - 1, and have seen a range belonging
+                * to the block group in freed_extents[] before we were able to
+                * clear the whole block group range from freed_extents[]. This
+                * means that task can look up the block group after we
+                * unpinned it from freed_extents[] and removed it, leading to
+                * a BUG_ON() at btrfs_unpin_extent_range().
+                */
+               mutex_lock(&fs_info->unused_bg_unpin_mutex);
                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
                                   EXTENT_DIRTY, GFP_NOFS);
                 if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                         btrfs_set_block_group_rw(root, block_group);
                         goto end_trans;
                 }
                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                   EXTENT_DIRTY, GFP_NOFS);
                 if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                         btrfs_set_block_group_rw(root, block_group);
                         goto end_trans;
                 }
+               mutex_unlock(&fs_info->unused_bg_unpin_mutex);
  
                 /* Reset pinned so btrfs_put_block_group doesn't complain */
                 block_group->pinned = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index c73df6a..c7233ff 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
  
         while (!list_empty(&states)) {
                 state = list_entry(states.next, struct extent_state, leak_list);
-               pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+               pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                        state->start, state->end, state->state,
                        extent_state_in_tree(state),
                        atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
  }
  
  static void set_state_cb(struct extent_io_tree *tree,
-                        struct extent_state *state, unsigned long *bits)
+                        struct extent_state *state, unsigned *bits)
  {
         if (tree->ops && tree->ops->set_bit_hook)
                 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
  }
  
  static void clear_state_cb(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned long *bits)
+                          struct extent_state *state, unsigned *bits)
  {
         if (tree->ops && tree->ops->clear_bit_hook)
                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
  }
  
  static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned long *bits);
+                          struct extent_state *state, unsigned *bits);
  
  /*
   * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
                         struct extent_state *state, u64 start, u64 end,
                         struct rb_node ***p,
                         struct rb_node **parent,
-                       unsigned long *bits)
+                       unsigned *bits)
  {
         struct rb_node *node;
  
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
   */
  static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                             struct extent_state *state,
-                                           unsigned long *bits, int wake)
+                                           unsigned *bits, int wake)
  {
         struct extent_state *next;
-       unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+       unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
  
         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                 u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
   * This takes the tree lock, and returns 0 on success and < 0 on error.
   */
  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, int wake, int delete,
+                    unsigned bits, int wake, int delete,
                      struct extent_state **cached_state,
                      gfp_t mask)
  {
@@ -789,9 +789,9 @@ out:
  
  static void set_state_bits(struct extent_io_tree *tree,
                            struct extent_state *state,
-                          unsigned long *bits)
+                          unsigned *bits)
  {
-       unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
+       unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
  
         set_state_cb(tree, state, bits);
         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
  
  static void cache_state_if_flags(struct extent_state *state,
                                  struct extent_state **cached_ptr,
-                                const u64 flags)
+                                unsigned flags)
  {
         if (cached_ptr && !(*cached_ptr)) {
                 if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
  
  static int __must_check
  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                unsigned long bits, unsigned long exclusive_bits,
+                unsigned bits, unsigned exclusive_bits,
                  u64 *failed_start, struct extent_state **cached_state,
                  gfp_t mask)
  {
@@ -1034,7 +1034,7 @@ search_again:
  }
  
  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, u64 * failed_start,
+                  unsigned bits, u64 * failed_start,
                    struct extent_state **cached_state, gfp_t mask)
  {
         return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
   * boundary bits like LOCK.
   */
  int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      unsigned long bits, unsigned long clear_bits,
+                      unsigned bits, unsigned clear_bits,
                        struct extent_state **cached_state, gfp_t mask)
  {
         struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
  }
  
  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                   unsigned long bits, gfp_t mask)
+                   unsigned bits, gfp_t mask)
  {
         return set_extent_bit(tree, start, end, bits, NULL,
                               NULL, mask);
  }
  
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                     unsigned long bits, gfp_t mask)
+                     unsigned bits, gfp_t mask)
  {
         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
  }
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
   * us if waiting is desired.
   */
  int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, struct extent_state **cached_state)
+                    unsigned bits, struct extent_state **cached_state)
  {
         int err;
         u64 failed_start;
+
         while (1) {
                 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                        EXTENT_LOCKED, &failed_start,
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
   */
  static struct extent_state *
  find_first_extent_bit_state(struct extent_io_tree *tree,
-                           u64 start, unsigned long bits)
+                           u64 start, unsigned bits)
  {
         struct rb_node *node;
         struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
   * If nothing was found, 1 is returned. If found something, return 0.
   */
  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, unsigned long bits,
+                         u64 *start_ret, u64 *end_ret, unsigned bits,
                           struct extent_state **cached_state)
  {
         struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
  
  int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
-                                unsigned long clear_bits,
+                                unsigned clear_bits,
                                  unsigned long page_ops)
  {
         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
   */
  u64 count_range_bits(struct extent_io_tree *tree,
                      u64 *start, u64 search_end, u64 max_bytes,
-                    unsigned long bits, int contig)
+                    unsigned bits, int contig)
  {
         struct rb_node *node;
         struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
   * range is found set.
   */
  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, int filled, struct extent_state *cached)
+                  unsigned bits, int filled, struct extent_state *cached)
  {
         struct extent_state *state = NULL;
         struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
         sector = bbio->stripes[mirror_num-1].physical >> 9;
         bio->bi_iter.bi_sector = sector;
         dev = bbio->stripes[mirror_num-1].dev;
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
         if (!dev || !dev->bdev || !dev->writeable) {
                 bio_put(bio);
                 return -EIO;
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                     bio_add_page(bio, page, page_size, offset) < page_size) {
                         ret = submit_one_bio(rw, bio, mirror_num,
                                              prev_bio_flags);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               *bio_ret = NULL;
                                 return ret;
+                       }
                         bio = NULL;
                 } else {
                         return 0;
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
                                                page,
                                                &delalloc_start,
                                                &delalloc_end,
-                                              128 * 1024 * 1024);
+                                              BTRFS_MAX_EXTENT_SIZE);
                 if (nr_delalloc == 0) {
                         delalloc_start = delalloc_end + 1;
                         continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
  
  static struct extent_buffer *
  __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
-                     unsigned long len, gfp_t mask)
+                     unsigned long len)
  {
         struct extent_buffer *eb = NULL;
  
-       eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
         if (eb == NULL)
                 return NULL;
         eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
         struct extent_buffer *new;
         unsigned long num_pages = num_extent_pages(src->start, src->len);
  
-       new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+       new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
         if (new == NULL)
                 return NULL;
  
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
         return new;
  }
  
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                               u64 start)
  {
         struct extent_buffer *eb;
-       unsigned long num_pages = num_extent_pages(0, len);
+       unsigned long len;
+       unsigned long num_pages;
         unsigned long i;
  
-       eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+       if (!fs_info) {
+               /*
+                * Called only from tests that don't always have a fs_info
+                * available, but we know that nodesize is 4096
+                */
+               len = 4096;
+       } else {
+               len = fs_info->tree_root->nodesize;
+       }
+       num_pages = num_extent_pages(0, len);
+
+       eb = __alloc_extent_buffer(fs_info, start, len);
         if (!eb)
                 return NULL;
  
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
  
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-                                              u64 start, unsigned long len)
+                                              u64 start)
  {
         struct extent_buffer *eb, *exists = NULL;
         int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
         eb = find_extent_buffer(fs_info, start);
         if (eb)
                 return eb;
-       eb = alloc_dummy_extent_buffer(start, len);
+       eb = alloc_dummy_extent_buffer(fs_info, start);
         if (!eb)
                 return NULL;
         eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
  #endif
  
  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
-                                         u64 start, unsigned long len)
+                                         u64 start)
  {
+       unsigned long len = fs_info->tree_root->nodesize;
         unsigned long num_pages = num_extent_pages(start, len);
         unsigned long i;
         unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
         if (eb)
                 return eb;
  
-       eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
+       eb = __alloc_extent_buffer(fs_info, start, len);
         if (!eb)
                 return NULL;
  
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h

index ece9ce8..695b0cc 100644 (file)
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
  #include <linux/rbtree.h>
  
  /* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_BOUNDARY (1 << 9)
-#define EXTENT_NODATASUM (1 << 10)
-#define EXTENT_DO_ACCOUNTING (1 << 11)
-#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_NEED_WAIT (1 << 13)
-#define EXTENT_DAMAGED (1 << 14)
-#define EXTENT_NORESERVE (1 << 15)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_DIRTY           (1U << 0)
+#define EXTENT_WRITEBACK       (1U << 1)
+#define EXTENT_UPTODATE                (1U << 2)
+#define EXTENT_LOCKED          (1U << 3)
+#define EXTENT_NEW             (1U << 4)
+#define EXTENT_DELALLOC                (1U << 5)
+#define EXTENT_DEFRAG          (1U << 6)
+#define EXTENT_BOUNDARY                (1U << 9)
+#define EXTENT_NODATASUM       (1U << 10)
+#define EXTENT_DO_ACCOUNTING   (1U << 11)
+#define EXTENT_FIRST_DELALLOC  (1U << 12)
+#define EXTENT_NEED_WAIT       (1U << 13)
+#define EXTENT_DAMAGED         (1U << 14)
+#define EXTENT_NORESERVE       (1U << 15)
+#define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
  
  /*
   * flags for bio submission. The high bits indicate the compression
@@ -81,9 +81,9 @@ struct extent_io_ops {
         int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
                                       struct extent_state *state, int uptodate);
         void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-                            unsigned long *bits);
+                            unsigned *bits);
         void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-                              unsigned long *bits);
+                              unsigned *bits);
         void (*merge_extent_hook)(struct inode *inode,
                                   struct extent_state *new,
                                   struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
         /* ADD NEW ELEMENTS AFTER THIS */
         wait_queue_head_t wq;
         atomic_t refs;
-       unsigned long state;
+       unsigned state;
  
         /* for use by the FS */
         u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
  int try_release_extent_buffer(struct page *page);
  int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
  int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, struct extent_state **cached);
+                    unsigned bits, struct extent_state **cached);
  int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
  int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
                          struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
  
  u64 count_range_bits(struct extent_io_tree *tree,
                      u64 *start, u64 search_end,
-                    u64 max_bytes, unsigned long bits, int contig);
+                    u64 max_bytes, unsigned bits, int contig);
  
  void free_extent_state(struct extent_state *state);
  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, int filled,
+                  unsigned bits, int filled,
                    struct extent_state *cached_state);
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                     unsigned long bits, gfp_t mask);
+                     unsigned bits, gfp_t mask);
  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, int wake, int delete,
+                    unsigned bits, int wake, int delete,
                      struct extent_state **cached, gfp_t mask);
  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                   unsigned long bits, gfp_t mask);
+                   unsigned bits, gfp_t mask);
  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, u64 *failed_start,
+                  unsigned bits, u64 *failed_start,
                    struct extent_state **cached_state, gfp_t mask);
  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                         struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                        gfp_t mask);
  int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      unsigned long bits, unsigned long clear_bits,
+                      unsigned bits, unsigned clear_bits,
                        struct extent_state **cached_state, gfp_t mask);
  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                         struct extent_state **cached_state, gfp_t mask);
  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
                       struct extent_state **cached_state, gfp_t mask);
  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, unsigned long bits,
+                         u64 *start_ret, u64 *end_ret, unsigned bits,
                           struct extent_state **cached_state);
  int extent_invalidatepage(struct extent_io_tree *tree,
                           struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
  void set_page_extent_mapped(struct page *page);
  
  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
-                                         u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+                                         u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+               u64 start);
  struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
  struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
  int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
  int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
-                                unsigned long bits_to_clear,
+                                unsigned bits_to_clear,
                                  unsigned long page_ops);
  struct bio *
  btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
                                       u64 *end, u64 max_bytes);
  #endif
  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-                                              u64 start, unsigned long len);
+                                              u64 start);
  #endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c

index d6c03f7..a719785 100644 (file)
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
         struct io_ctl io_ctl;
         struct btrfs_key key;
         struct btrfs_free_space *e, *n;
-       struct list_head bitmaps;
+       LIST_HEAD(bitmaps);
         u64 num_entries;
         u64 num_bitmaps;
         u64 generation;
         u8 type;
         int ret = 0;
  
-       INIT_LIST_HEAD(&bitmaps);
-
         /* Nothing in the space cache, goodbye */
         if (!i_size_read(inode))
                 return 0;
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
         struct inode *inode;
         int ret = 0;
+       enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
  
         root = root->fs_info->tree_root;
  
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
                                       path, block_group->key.objectid);
         if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->disk_cache_state = BTRFS_DC_ERROR;
-               spin_unlock(&block_group->lock);
+               dcs = BTRFS_DC_ERROR;
                 ret = 0;
  #ifdef DEBUG
                 btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
  #endif
         }
  
+       spin_lock(&block_group->lock);
+       block_group->disk_cache_state = dcs;
+       spin_unlock(&block_group->lock);
         iput(inode);
         return ret;
  }
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
                                  min_bytes);
  
-       INIT_LIST_HEAD(&bitmaps);
         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
                                       bytes + empty_size,
                                       cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c

index 8ffa478..265e03c 100644 (file)
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                 return -ENOMEM;
  
         path->leave_spinning = 1;
+       path->skip_release_on_error = 1;
         ret = btrfs_insert_empty_item(trans, root, path, &key,
                                       ins_len);
         if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                 ptr = (unsigned long)(ref + 1);
                 ret = 0;
         } else if (ret < 0) {
-               if (ret == -EOVERFLOW)
-                       ret = -EMLINK;
+               if (ret == -EOVERFLOW) {
+                       if (find_name_in_backref(path, name, name_len, &ref))
+                               ret = -EEXIST;
+                       else
+                               ret = -EMLINK;
+               }
                 goto out;
         } else {
                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 54bcf63..a85c23d 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
  static void btrfs_split_extent_hook(struct inode *inode,
                                     struct extent_state *orig, u64 split)
  {
+       u64 size;
+
         /* not delalloc, ignore it */
         if (!(orig->state & EXTENT_DELALLOC))
                 return;
  
+       size = orig->end - orig->start + 1;
+       if (size > BTRFS_MAX_EXTENT_SIZE) {
+               u64 num_extents;
+               u64 new_size;
+
+               /*
+                * We need the largest size of the remaining extent to see if we
+                * need to add a new outstanding extent.  Think of the following
+                * case
+                *
+                * [MAX_EXTENT_SIZEx2 - 4k][4k]
+                *
+                * The new_size would just be 4k and we'd think we had enough
+                * outstanding extents for this if we only took one side of the
+                * split, same goes for the other direction.  We need to see if
+                * the larger size still is the same amount of extents as the
+                * original size, because if it is we need to add a new
+                * outstanding extent.  But if we split up and the larger size
+                * is less than the original then we are good to go since we've
+                * already accounted for the extra extent in our original
+                * accounting.
+                */
+               new_size = orig->end - split + 1;
+               if ((split - orig->start) > new_size)
+                       new_size = split - orig->start;
+
+               num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                       BTRFS_MAX_EXTENT_SIZE);
+               if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                             BTRFS_MAX_EXTENT_SIZE) < num_extents)
+                       return;
+       }
+
         spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->outstanding_extents++;
         spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
                                     struct extent_state *new,
                                     struct extent_state *other)
  {
+       u64 new_size, old_size;
+       u64 num_extents;
+
         /* not delalloc, ignore it */
         if (!(other->state & EXTENT_DELALLOC))
                 return;
  
+       old_size = other->end - other->start + 1;
+       new_size = old_size + (new->end - new->start + 1);
+
+       /* we're not bigger than the max, unreserve the space and go */
+       if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->outstanding_extents--;
+               spin_unlock(&BTRFS_I(inode)->lock);
+               return;
+       }
+
+       /*
+        * If we grew by another max_extent, just return, we want to keep that
+        * reserved amount.
+        */
+       num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                               BTRFS_MAX_EXTENT_SIZE);
+       if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                     BTRFS_MAX_EXTENT_SIZE) > num_extents)
+               return;
+
         spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->outstanding_extents--;
         spin_unlock(&BTRFS_I(inode)->lock);
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
   * have pending delalloc work to be done.
   */
  static void btrfs_set_bit_hook(struct inode *inode,
-                              struct extent_state *state, unsigned long *bits)
+                              struct extent_state *state, unsigned *bits)
  {
  
         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
   */
  static void btrfs_clear_bit_hook(struct inode *inode,
                                  struct extent_state *state,
-                                unsigned long *bits)
+                                unsigned *bits)
  {
         u64 len = state->end + 1 - state->start;
+       u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+                                   BTRFS_MAX_EXTENT_SIZE);
  
         spin_lock(&BTRFS_I(inode)->lock);
         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                         *bits &= ~EXTENT_FIRST_DELALLOC;
                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                         spin_lock(&BTRFS_I(inode)->lock);
-                       BTRFS_I(inode)->outstanding_extents--;
+                       BTRFS_I(inode)->outstanding_extents -= num_extents;
                         spin_unlock(&BTRFS_I(inode)->lock);
                 }
  
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
         return 0;
  zeroit:
         if (__ratelimit(&_rs))
-               btrfs_info(BTRFS_I(inode)->root->fs_info,
+               btrfs_warn(BTRFS_I(inode)->root->fs_info,
                            "csum failed ino %llu off %llu csum %u expected csum %u",
                            btrfs_ino(inode), start, csum, csum_expected);
         memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
  
  out:
         if (ret)
-               btrfs_crit(root->fs_info,
+               btrfs_err(root->fs_info,
                         "could not do orphan cleanup %d", ret);
         btrfs_free_path(path);
         return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
         struct btrfs_path *path;
         struct extent_buffer *leaf;
         struct btrfs_inode_item *inode_item;
-       struct btrfs_timespec *tspec;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_key location;
         unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
  
-       tspec = btrfs_inode_atime(inode_item);
-       inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+       inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+       inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+       inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
  
-       tspec = btrfs_inode_mtime(inode_item);
-       inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+       inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
  
-       tspec = btrfs_inode_ctime(inode_item);
-       inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       BTRFS_I(inode)->i_otime.tv_sec =
+               btrfs_timespec_sec(leaf, &inode_item->otime);
+       BTRFS_I(inode)->i_otime.tv_nsec =
+               btrfs_timespec_nsec(leaf, &inode_item->otime);
  
         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3656,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->atime,
                                      inode->i_atime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                       inode->i_atime.tv_nsec, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                      inode->i_mtime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                       inode->i_mtime.tv_nsec, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                      inode->i_ctime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                       inode->i_ctime.tv_nsec, &token);
  
+       btrfs_set_token_timespec_sec(leaf, &item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, &item->otime,
+                                     BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
                                      &token);
         btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5007,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
         struct btrfs_root *new_root;
         struct btrfs_root_ref *ref;
         struct extent_buffer *leaf;
+       struct btrfs_key key;
         int ret;
         int err = 0;
  
@@ -5017,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
         }
  
         err = -ENOENT;
-       ret = btrfs_find_item(root->fs_info->tree_root, path,
-                               BTRFS_I(dir)->root->root_key.objectid,
-                               location->objectid, BTRFS_ROOT_REF_KEY, NULL);
+       key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = location->objectid;
+
+       ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+                               0, 0);
         if (ret) {
                 if (ret < 0)
                         err = ret;
@@ -5258,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
         inode->i_op = &btrfs_dir_ro_inode_operations;
         inode->i_fop = &simple_dir_operations;
         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+       inode->i_mtime = CURRENT_TIME;
+       inode->i_atime = inode->i_mtime;
+       inode->i_ctime = inode->i_mtime;
+       BTRFS_I(inode)->i_otime = inode->i_mtime;
  
         return inode;
  }
@@ -5826,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
  
         inode_init_owner(inode, dir, mode);
         inode_set_bytes(inode, 0);
-       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+       inode->i_mtime = CURRENT_TIME;
+       inode->i_atime = inode->i_mtime;
+       inode->i_ctime = inode->i_mtime;
+       BTRFS_I(inode)->i_otime = inode->i_mtime;
+
         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                   struct btrfs_inode_item);
         memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -7134,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         u64 start = iblock << inode->i_blkbits;
         u64 lockstart, lockend;
         u64 len = bh_result->b_size;
+       u64 orig_len = len;
         int unlock_bits = EXTENT_LOCKED;
         int ret = 0;
  
         if (create)
-               unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+               unlock_bits |= EXTENT_DIRTY;
         else
                 len = min_t(u64, len, root->sectorsize);
  
@@ -7269,14 +7349,12 @@ unlock:
                 if (start + len > i_size_read(inode))
                         i_size_write(inode, start + len);
  
-               spin_lock(&BTRFS_I(inode)->lock);
-               BTRFS_I(inode)->outstanding_extents++;
-               spin_unlock(&BTRFS_I(inode)->lock);
-
-               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockstart + len - 1, EXTENT_DELALLOC, NULL,
-                                    &cached_state, GFP_NOFS);
-               BUG_ON(ret);
+               if (len < orig_len) {
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       BTRFS_I(inode)->outstanding_extents++;
+                       spin_unlock(&BTRFS_I(inode)->lock);
+               }
+               btrfs_free_reserved_data_space(inode, len);
         }
  
         /*
@@ -7805,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         }
  
         /* async crcs make it difficult to collect full stripe writes. */
-       if (btrfs_get_alloc_profile(root, 1) &
-           (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+       if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
                 async_submit = 0;
         else
                 async_submit = 1;
@@ -8053,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                 else if (ret >= 0 && (size_t)ret < count)
                         btrfs_delalloc_release_space(inode,
                                                      count - (size_t)ret);
-               else
-                       btrfs_delalloc_release_metadata(inode, 0);
         }
  out:
         if (wakeup)
@@ -8575,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
  
         ei->delayed_node = NULL;
  
+       ei->i_otime.tv_sec = 0;
+       ei->i_otime.tv_nsec = 0;
+
         inode = &ei->vfs_inode;
         extent_map_tree_init(&ei->extent_tree);
         extent_io_tree_init(&ei->io_tree, &inode->i_data);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c

index 48b60db..97159a8 100644 (file)
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                 qgroup = u64_to_ptr(unode->aux);
                 qgroup->rfer += sign * oper->num_bytes;
                 qgroup->rfer_cmpr += sign * oper->num_bytes;
+               WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
                 qgroup->excl += sign * oper->num_bytes;
-               if (sign < 0)
-                       WARN_ON(qgroup->excl < oper->num_bytes);
                 qgroup->excl_cmpr += sign * oper->num_bytes;
                 qgroup_dirty(fs_info, qgroup);
  
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c

index 8ab2a17..5264858 100644 (file)
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
   */
  #define RBIO_CACHE_READY_BIT   3
  
-/*
- * bbio and raid_map is managed by the caller, so we shouldn't free
- * them here. And besides that, all rbios with this flag should not
- * be cached, because we need raid_map to check the rbios' stripe
- * is the same or not, but it is very likely that the caller has
- * free raid_map, so don't cache those rbios.
- */
-#define RBIO_HOLD_BBIO_MAP_BIT 4
-
  #define RBIO_CACHE_SIZE 1024
  
  enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
         struct btrfs_fs_info *fs_info;
         struct btrfs_bio *bbio;
  
-       /*
-        * logical block numbers for the start of each stripe
-        * The last one or two are p/q.  These are sorted,
-        * so raid_map[0] is the start of our full stripe
-        */
-       u64 *raid_map;
-
         /* while we're doing rmw on a stripe
          * we put it into a hash table so we can
          * lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
   */
  static int rbio_bucket(struct btrfs_raid_bio *rbio)
  {
-       u64 num = rbio->raid_map[0];
+       u64 num = rbio->bbio->raid_map[0];
  
         /*
          * we shift down quite a bit.  We're using byte
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
             test_bit(RBIO_CACHE_BIT, &cur->flags))
                 return 0;
  
-       if (last->raid_map[0] !=
-           cur->raid_map[0])
+       if (last->bbio->raid_map[0] !=
+           cur->bbio->raid_map[0])
                 return 0;
  
         /* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
         spin_lock_irqsave(&h->lock, flags);
         list_for_each_entry(cur, &h->hash_list, hash_list) {
                 walk++;
-               if (cur->raid_map[0] == rbio->raid_map[0]) {
+               if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
                         spin_lock(&cur->bio_list_lock);
  
                         /* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
                 remove_rbio_from_cache(rbio);
  }
  
-static inline void
-__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
-{
-       if (need) {
-               kfree(raid_map);
-               kfree(bbio);
-       }
-}
-
-static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
-{
-       __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
-                       !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
-}
-
  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
  {
         int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                 }
         }
  
-       free_bbio_and_raid_map(rbio);
-
+       btrfs_put_bbio(rbio->bbio);
         kfree(rbio);
  }
  
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
   * this does not allocate any pages for rbio->pages.
   */
  static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len)
+                         struct btrfs_bio *bbio, u64 stripe_len)
  {
         struct btrfs_raid_bio *rbio;
         int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
         INIT_LIST_HEAD(&rbio->stripe_cache);
         INIT_LIST_HEAD(&rbio->hash_list);
         rbio->bbio = bbio;
-       rbio->raid_map = raid_map;
         rbio->fs_info = root->fs_info;
         rbio->stripe_len = stripe_len;
         rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
         rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
  
-       if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+               nr_data = real_stripes - 1;
+       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
                 nr_data = real_stripes - 2;
         else
-               nr_data = real_stripes - 1;
+               BUG();
  
         rbio->nr_data = nr_data;
         return rbio;
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
         spin_lock_irq(&rbio->bio_list_lock);
         bio_list_for_each(bio, &rbio->bio_list) {
                 start = (u64)bio->bi_iter.bi_sector << 9;
-               stripe_offset = start - rbio->raid_map[0];
+               stripe_offset = start - rbio->bbio->raid_map[0];
                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
  
                 for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
         logical <<= 9;
  
         for (i = 0; i < rbio->nr_data; i++) {
-               stripe_start = rbio->raid_map[i];
+               stripe_start = rbio->bbio->raid_map[i];
                 if (logical >= stripe_start &&
                     logical < stripe_start + rbio->stripe_len) {
                         return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
   * our main entry point for writes from the rest of the FS.
   */
  int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-                       struct btrfs_bio *bbio, u64 *raid_map,
-                       u64 stripe_len)
+                       struct btrfs_bio *bbio, u64 stripe_len)
  {
         struct btrfs_raid_bio *rbio;
         struct btrfs_plug_cb *plug = NULL;
         struct blk_plug_cb *cb;
         int ret;
  
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
         if (IS_ERR(rbio)) {
-               __free_bbio_and_raid_map(bbio, raid_map, 1);
+               btrfs_put_bbio(bbio);
                 return PTR_ERR(rbio);
         }
         bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                 }
  
                 /* all raid6 handling here */
-               if (rbio->raid_map[rbio->real_stripes - 1] ==
-                   RAID6_Q_STRIPE) {
-
+               if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
                         /*
                          * single failure, rebuild from parity raid5
                          * style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                          * here due to a crc mismatch and we can't give them the
                          * data they want
                          */
-                       if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
-                               if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+                       if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+                               if (rbio->bbio->raid_map[faila] ==
+                                   RAID5_P_STRIPE) {
                                         err = -EIO;
                                         goto cleanup;
                                 }
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                                 goto pstripe;
                         }
  
-                       if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+                       if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
                                 raid6_datap_recov(rbio->real_stripes,
                                                   PAGE_SIZE, faila, pointers);
                         } else {
@@ -2001,8 +1967,7 @@ cleanup:
  
  cleanup_io:
         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
-               if (err == 0 &&
-                   !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
+               if (err == 0)
                         cache_rbio_pages(rbio);
                 else
                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
   * of the drive.
   */
  int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num, int generic_io)
+                         struct btrfs_bio *bbio, u64 stripe_len,
+                         int mirror_num, int generic_io)
  {
         struct btrfs_raid_bio *rbio;
         int ret;
  
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
         if (IS_ERR(rbio)) {
-               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+               if (generic_io)
+                       btrfs_put_bbio(bbio);
                 return PTR_ERR(rbio);
         }
  
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
         rbio->faila = find_logical_bio_stripe(rbio, bio);
         if (rbio->faila == -1) {
                 BUG();
-               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+               if (generic_io)
+                       btrfs_put_bbio(bbio);
                 kfree(rbio);
                 return -EIO;
         }
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                 btrfs_bio_counter_inc_noblocked(root->fs_info);
                 rbio->generic_bio_cnt = 1;
         } else {
-               set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+               btrfs_get_bbio(bbio);
         }
  
         /*
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
  
  struct btrfs_raid_bio *
  raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              struct btrfs_bio *bbio, u64 stripe_len,
+                              struct btrfs_device *scrub_dev,
                                unsigned long *dbitmap, int stripe_nsectors)
  {
         struct btrfs_raid_bio *rbio;
         int i;
  
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
         if (IS_ERR(rbio))
                 return NULL;
         bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
         int stripe_offset;
         int index;
  
-       ASSERT(logical >= rbio->raid_map[0]);
-       ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+       ASSERT(logical >= rbio->bbio->raid_map[0]);
+       ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
                                 rbio->stripe_len * rbio->nr_data);
-       stripe_offset = (int)(logical - rbio->raid_map[0]);
+       stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
         index = stripe_offset >> PAGE_CACHE_SHIFT;
         rbio->bio_pages[index] = page;
  }
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h

index 31d4a15..2b5d797 100644 (file)
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
  struct btrfs_device;
  
  int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num, int generic_io);
+                         struct btrfs_bio *bbio, u64 stripe_len,
+                         int mirror_num, int generic_io);
  int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len);
+                              struct btrfs_bio *bbio, u64 stripe_len);
  
  struct btrfs_raid_bio *
  raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              struct btrfs_bio *bbio, u64 stripe_len,
+                              struct btrfs_device *scrub_dev,
                                unsigned long *dbitmap, int stripe_nsectors);
  void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
                                    struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c

index b63ae20..0e7beea 100644 (file)
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
  struct reada_extent {
         u64                     logical;
         struct btrfs_key        top;
-       u32                     blocksize;
         int                     err;
         struct list_head        extctl;
         int                     refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
  
         blocksize = root->nodesize;
         re->logical = logical;
-       re->blocksize = blocksize;
         re->top = *top;
         INIT_LIST_HEAD(&re->extctl);
         spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
         spin_unlock(&fs_info->reada_lock);
         btrfs_dev_replace_unlock(&fs_info->dev_replace);
  
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
         return re;
  
  error:
@@ -488,7 +486,7 @@ error:
                 kref_put(&zone->refcnt, reada_zone_release);
                 spin_unlock(&fs_info->reada_lock);
         }
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
         kfree(re);
         return re_exist;
  }
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
         int mirror_num = 0;
         struct extent_buffer *eb = NULL;
         u64 logical;
-       u32 blocksize;
         int ret;
         int i;
         int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                 spin_unlock(&fs_info->reada_lock);
                 return 0;
         }
-       dev->reada_next = re->logical + re->blocksize;
+       dev->reada_next = re->logical + fs_info->tree_root->nodesize;
         re->refcnt++;
  
         spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                 }
         }
         logical = re->logical;
-       blocksize = re->blocksize;
  
         spin_lock(&re->lock);
         if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                 return 0;
  
         atomic_inc(&dev->reada_in_flight);
-       ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
-                        mirror_num, &eb);
+       ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+                       mirror_num, &eb);
         if (ret)
                 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
         else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                                 break;
                         printk(KERN_DEBUG
                                 "  re: logical %llu size %u empty %d for %lld",
-                               re->logical, re->blocksize,
+                               re->logical, fs_info->tree_root->nodesize,
                                 list_empty(&re->extctl), re->scheduled_for ?
                                 re->scheduled_for->devid : -1);
  
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                 }
                 printk(KERN_DEBUG
                         "re: logical %llu size %u list empty %d for %lld",
-                       re->logical, re->blocksize, list_empty(&re->extctl),
+                       re->logical, fs_info->tree_root->nodesize,
+                       list_empty(&re->extctl),
                         re->scheduled_for ? re->scheduled_for->devid : -1);
                 for (i = 0; i < re->nzones; ++i) {
                         printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c

index 74257d6..d830853 100644 (file)
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
         }
  }
  
-static int tree_block_processed(u64 bytenr, u32 blocksize,
-                               struct reloc_control *rc)
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
  {
+       u32 blocksize = rc->extent_root->nodesize;
+
         if (test_range_bit(&rc->processed_blocks, bytenr,
                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
                 return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
         while (rb_node) {
                 block = rb_entry(rb_node, struct tree_block, rb_node);
                 if (!block->key_ready)
-                       readahead_tree_block(rc->extent_root, block->bytenr,
-                                       block->key.objectid);
+                       readahead_tree_block(rc->extent_root, block->bytenr);
                 rb_node = rb_next(rb_node);
         }
  
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
         bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
                                         SKINNY_METADATA);
  
-       if (tree_block_processed(bytenr, blocksize, rc))
+       if (tree_block_processed(bytenr, rc))
                 return 0;
  
         if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
                 if (added)
                         goto next;
  
-               if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+               if (!tree_block_processed(leaf->start, rc)) {
                         block = kmalloc(sizeof(*block), GFP_NOFS);
                         if (!block) {
                                 err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c

index e427cb7..ec57687 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
  struct scrub_recover {
         atomic_t                refs;
         struct btrfs_bio        *bbio;
-       u64                     *raid_map;
         u64                     map_length;
  };
  
@@ -80,7 +79,7 @@ struct scrub_page {
         u64                     logical;
         u64                     physical;
         u64                     physical_for_dev_replace;
-       atomic_t                ref_count;
+       atomic_t                refs;
         struct {
                 unsigned int    mirror_num:8;
                 unsigned int    have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
         int                     page_count;
         atomic_t                outstanding_pages;
-       atomic_t                ref_count; /* free mem on transition to zero */
+       atomic_t                refs; /* free mem on transition to zero */
         struct scrub_ctx        *sctx;
         struct scrub_parity     *sparity;
         struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
  
         int                     stripe_len;
  
-       atomic_t                ref_count;
+       atomic_t                refs;
  
         struct list_head        spages;
  
@@ -194,6 +193,15 @@ struct scrub_ctx {
          */
         struct btrfs_scrub_progress stat;
         spinlock_t              stat_lock;
+
+       /*
+        * Use a ref counter to avoid use-after-free issues. Scrub workers
+        * decrement bios_in_flight and workers_pending and then do a wakeup
+        * on the list_wait wait queue. We must ensure the main scrub task
+        * doesn't free the scrub context before or while the workers are
+        * doing the wakeup() call.
+        */
+       atomic_t                refs;
  };
  
  struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
  static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
  static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
  static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
-                                    struct btrfs_fs_info *fs_info,
-                                    struct scrub_block *original_sblock,
-                                    u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                      struct scrub_block *sblocks_for_recheck);
  static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                 struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                                          const u8 *csum, u64 generation,
                                          u16 csum_size);
  static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-                                            struct scrub_block *sblock_good,
-                                            int force_write);
+                                            struct scrub_block *sblock_good);
  static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                                             struct scrub_block *sblock_good,
                                             int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  static void copy_nocow_pages_worker(struct btrfs_work *work);
  static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
  static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
  
  
  static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
  {
+       atomic_inc(&sctx->refs);
         atomic_inc(&sctx->bios_in_flight);
  }
  
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
  {
         atomic_dec(&sctx->bios_in_flight);
         wake_up(&sctx->list_wait);
+       scrub_put_ctx(sctx);
  }
  
  static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
  {
         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
  
+       atomic_inc(&sctx->refs);
         /*
          * increment scrubs_running to prevent cancel requests from
          * completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
         atomic_dec(&sctx->workers_pending);
         wake_up(&fs_info->scrub_pause_wait);
         wake_up(&sctx->list_wait);
+       scrub_put_ctx(sctx);
  }
  
  static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
         kfree(sctx);
  }
  
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+       if (atomic_dec_and_test(&sctx->refs))
+               scrub_free_ctx(sctx);
+}
+
  static noinline_for_stack
  struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
  {
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
         if (!sctx)
                 goto nomem;
+       atomic_set(&sctx->refs, 1);
         sctx->is_dev_replace = is_dev_replace;
         sctx->pages_per_rd_bio = pages_per_rd_bio;
         sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
         struct inode_fs_paths *ipath = NULL;
         struct btrfs_root *local_root;
         struct btrfs_key root_key;
+       struct btrfs_key key;
  
         root_key.objectid = root;
         root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
                 goto err;
         }
  
-       ret = inode_item_info(inum, 0, local_root, swarn->path);
+       /*
+        * this makes the path point to (inum INODE_ITEM ioff)
+        */
+       key.objectid = inum;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
         if (ret) {
                 btrfs_release_path(swarn->path);
                 goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
  static inline void scrub_put_recover(struct scrub_recover *recover)
  {
         if (atomic_dec_and_test(&recover->refs)) {
-               kfree(recover->bbio);
-               kfree(recover->raid_map);
+               btrfs_put_bbio(recover->bbio);
                 kfree(recover);
         }
  }
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
         }
  
         /* setup the context, map the logical blocks and alloc the pages */
-       ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
-                                       logical, sblocks_for_recheck);
+       ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
         if (ret) {
                 spin_lock(&sctx->stat_lock);
                 sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
         if (!is_metadata && !have_csum) {
                 struct scrub_fixup_nodatasum *fixup_nodatasum;
  
-nodatasum_case:
                 WARN_ON(sctx->is_dev_replace);
  
+nodatasum_case:
+
                 /*
                  * !is_metadata and !have_csum, this means that the data
                  * might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
                     sblock_other->no_io_error_seen) {
                         if (sctx->is_dev_replace) {
                                 scrub_write_block_to_dev_replace(sblock_other);
+                               goto corrected_error;
                         } else {
-                               int force_write = is_metadata || have_csum;
-
                                 ret = scrub_repair_block_from_good_copy(
-                                               sblock_bad, sblock_other,
-                                               force_write);
+                                               sblock_bad, sblock_other);
+                               if (!ret)
+                                       goto corrected_error;
                         }
-                       if (0 == ret)
-                               goto corrected_error;
                 }
         }
  
-       /*
-        * for dev_replace, pick good pages and write to the target device.
-        */
-       if (sctx->is_dev_replace) {
-               success = 1;
-               for (page_num = 0; page_num < sblock_bad->page_count;
-                    page_num++) {
-                       int sub_success;
-
-                       sub_success = 0;
-                       for (mirror_index = 0;
-                            mirror_index < BTRFS_MAX_MIRRORS &&
-                            sblocks_for_recheck[mirror_index].page_count > 0;
-                            mirror_index++) {
-                               struct scrub_block *sblock_other =
-                                       sblocks_for_recheck + mirror_index;
-                               struct scrub_page *page_other =
-                                       sblock_other->pagev[page_num];
-
-                               if (!page_other->io_error) {
-                                       ret = scrub_write_page_to_dev_replace(
-                                                       sblock_other, page_num);
-                                       if (ret == 0) {
-                                               /* succeeded for this page */
-                                               sub_success = 1;
-                                               break;
-                                       } else {
-                                               btrfs_dev_replace_stats_inc(
-                                                       &sctx->dev_root->
-                                                       fs_info->dev_replace.
-                                                       num_write_errors);
-                                       }
-                               }
-                       }
-
-                       if (!sub_success) {
-                               /*
-                                * did not find a mirror to fetch the page
-                                * from. scrub_write_page_to_dev_replace()
-                                * handles this case (page->io_error), by
-                                * filling the block with zeros before
-                                * submitting the write request
-                                */
-                               success = 0;
-                               ret = scrub_write_page_to_dev_replace(
-                                               sblock_bad, page_num);
-                               if (ret)
-                                       btrfs_dev_replace_stats_inc(
-                                               &sctx->dev_root->fs_info->
-                                               dev_replace.num_write_errors);
-                       }
-               }
-
-               goto out;
-       }
+       if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+               goto did_not_correct_error;
  
         /*
-        * for regular scrub, repair those pages that are errored.
          * In case of I/O errors in the area that is supposed to be
          * repaired, continue by picking good copies of those pages.
          * Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
          * area are unreadable.
          */
-
-       /* can only fix I/O errors from here on */
-       if (sblock_bad->no_io_error_seen)
-               goto did_not_correct_error;
-
         success = 1;
-       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+       for (page_num = 0; page_num < sblock_bad->page_count;
+            page_num++) {
                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+               struct scrub_block *sblock_other = NULL;
  
-               if (!page_bad->io_error)
+               /* skip no-io-error page in scrub */
+               if (!page_bad->io_error && !sctx->is_dev_replace)
                         continue;
  
-               for (mirror_index = 0;
-                    mirror_index < BTRFS_MAX_MIRRORS &&
-                    sblocks_for_recheck[mirror_index].page_count > 0;
-                    mirror_index++) {
-                       struct scrub_block *sblock_other = sblocks_for_recheck +
-                                                          mirror_index;
-                       struct scrub_page *page_other = sblock_other->pagev[
-                                                       page_num];
-
-                       if (!page_other->io_error) {
-                               ret = scrub_repair_page_from_good_copy(
-                                       sblock_bad, sblock_other, page_num, 0);
-                               if (0 == ret) {
-                                       page_bad->io_error = 0;
-                                       break; /* succeeded for this page */
+               /* try to find no-io-error page in mirrors */
+               if (page_bad->io_error) {
+                       for (mirror_index = 0;
+                            mirror_index < BTRFS_MAX_MIRRORS &&
+                            sblocks_for_recheck[mirror_index].page_count > 0;
+                            mirror_index++) {
+                               if (!sblocks_for_recheck[mirror_index].
+                                   pagev[page_num]->io_error) {
+                                       sblock_other = sblocks_for_recheck +
+                                                      mirror_index;
+                                       break;
                                 }
                         }
+                       if (!sblock_other)
+                               success = 0;
                 }
  
-               if (page_bad->io_error) {
-                       /* did not find a mirror to copy the page from */
-                       success = 0;
+               if (sctx->is_dev_replace) {
+                       /*
+                        * did not find a mirror to fetch the page
+                        * from. scrub_write_page_to_dev_replace()
+                        * handles this case (page->io_error), by
+                        * filling the block with zeros before
+                        * submitting the write request
+                        */
+                       if (!sblock_other)
+                               sblock_other = sblock_bad;
+
+                       if (scrub_write_page_to_dev_replace(sblock_other,
+                                                           page_num) != 0) {
+                               btrfs_dev_replace_stats_inc(
+                                       &sctx->dev_root->
+                                       fs_info->dev_replace.
+                                       num_write_errors);
+                               success = 0;
+                       }
+               } else if (sblock_other) {
+                       ret = scrub_repair_page_from_good_copy(sblock_bad,
+                                                              sblock_other,
+                                                              page_num, 0);
+                       if (0 == ret)
+                               page_bad->io_error = 0;
+                       else
+                               success = 0;
                 }
         }
  
-       if (success) {
+       if (success && !sctx->is_dev_replace) {
                 if (is_metadata || have_csum) {
                         /*
                          * need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
         return 0;
  }
  
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
  {
-       if (raid_map) {
-               if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
-                       return 3;
-               else
-                       return 2;
-       } else {
+       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+               return 2;
+       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+               return 3;
+       else
                 return (int)bbio->num_stripes;
-       }
  }
  
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+                                                u64 *raid_map,
                                                  u64 mapped_length,
                                                  int nstripes, int mirror,
                                                  int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
  {
         int i;
  
-       if (raid_map) {
+       if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                 /* RAID5/6 */
                 for (i = 0; i < nstripes; i++) {
                         if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
         }
  }
  
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
-                                    struct btrfs_fs_info *fs_info,
-                                    struct scrub_block *original_sblock,
-                                    u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                      struct scrub_block *sblocks_for_recheck)
  {
+       struct scrub_ctx *sctx = original_sblock->sctx;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+       u64 length = original_sblock->page_count * PAGE_SIZE;
+       u64 logical = original_sblock->pagev[0]->logical;
         struct scrub_recover *recover;
         struct btrfs_bio *bbio;
-       u64 *raid_map;
         u64 sublen;
         u64 mapped_length;
         u64 stripe_offset;
         int stripe_index;
-       int page_index;
+       int page_index = 0;
         int mirror_index;
         int nmirrors;
         int ret;
  
         /*
-        * note: the two members ref_count and outstanding_pages
+        * note: the two members refs and outstanding_pages
          * are not used (and not set) in the blocks that are used for
          * the recheck procedure
          */
  
-       page_index = 0;
         while (length > 0) {
                 sublen = min_t(u64, length, PAGE_SIZE);
                 mapped_length = sublen;
                 bbio = NULL;
-               raid_map = NULL;
  
                 /*
                  * with a length of PAGE_SIZE, each returned stripe
                  * represents one mirror
                  */
                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
-                                      &mapped_length, &bbio, 0, &raid_map);
+                                      &mapped_length, &bbio, 0, 1);
                 if (ret || !bbio || mapped_length < sublen) {
-                       kfree(bbio);
-                       kfree(raid_map);
+                       btrfs_put_bbio(bbio);
                         return -EIO;
                 }
  
                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
                 if (!recover) {
-                       kfree(bbio);
-                       kfree(raid_map);
+                       btrfs_put_bbio(bbio);
                         return -ENOMEM;
                 }
  
                 atomic_set(&recover->refs, 1);
                 recover->bbio = bbio;
-               recover->raid_map = raid_map;
                 recover->map_length = mapped_length;
  
                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
  
-               nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+               nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
                 for (mirror_index = 0; mirror_index < nmirrors;
                      mirror_index++) {
                         struct scrub_block *sblock;
                         struct scrub_page *page;
  
-                       if (mirror_index >= BTRFS_MAX_MIRRORS)
-                               continue;
-
                         sblock = sblocks_for_recheck + mirror_index;
                         sblock->sctx = sctx;
                         page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
                         sblock->pagev[page_index] = page;
                         page->logical = logical;
  
-                       scrub_stripe_index_and_offset(logical, raid_map,
+                       scrub_stripe_index_and_offset(logical,
+                                                     bbio->map_type,
+                                                     bbio->raid_map,
                                                       mapped_length,
-                                                     bbio->num_stripes,
+                                                     bbio->num_stripes -
+                                                     bbio->num_tgtdevs,
                                                       mirror_index,
                                                       &stripe_index,
                                                       &stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
  
  static inline int scrub_is_page_on_raid56(struct scrub_page *page)
  {
-       return page->recover && page->recover->raid_map;
+       return page->recover &&
+              (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
  }
  
  static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
         bio->bi_end_io = scrub_bio_wait_endio;
  
         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
-                                   page->recover->raid_map,
                                     page->recover->map_length,
                                     page->mirror_num, 0);
         if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
  }
  
  static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-                                            struct scrub_block *sblock_good,
-                                            int force_write)
+                                            struct scrub_block *sblock_good)
  {
         int page_num;
         int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
  
                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
                                                            sblock_good,
-                                                          page_num,
-                                                          force_write);
+                                                          page_num, 1);
                 if (ret_sub)
                         ret = ret_sub;
         }
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
  
  static void scrub_block_get(struct scrub_block *sblock)
  {
-       atomic_inc(&sblock->ref_count);
+       atomic_inc(&sblock->refs);
  }
  
  static void scrub_block_put(struct scrub_block *sblock)
  {
-       if (atomic_dec_and_test(&sblock->ref_count)) {
+       if (atomic_dec_and_test(&sblock->refs)) {
                 int i;
  
                 if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
  
  static void scrub_page_get(struct scrub_page *spage)
  {
-       atomic_inc(&spage->ref_count);
+       atomic_inc(&spage->refs);
  }
  
  static void scrub_page_put(struct scrub_page *spage)
  {
-       if (atomic_dec_and_test(&spage->ref_count)) {
+       if (atomic_dec_and_test(&spage->refs)) {
                 if (spage->page)
                         __free_page(spage->page);
                 kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  
         /* one ref inside this function, plus one for each page added to
          * a bio later on */
-       atomic_set(&sblock->ref_count, 1);
+       atomic_set(&sblock->refs, 1);
         sblock->sctx = sctx;
         sblock->no_io_error_seen = 1;
  
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
  
         /* one ref inside this function, plus one for each page added to
          * a bio later on */
-       atomic_set(&sblock->ref_count, 1);
+       atomic_set(&sblock->refs, 1);
         sblock->sctx = sctx;
         sblock->no_io_error_seen = 1;
         sblock->sparity = sparity;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
         struct btrfs_raid_bio *rbio;
         struct scrub_page *spage;
         struct btrfs_bio *bbio = NULL;
-       u64 *raid_map = NULL;
         u64 length;
         int ret;
  
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
         length = sparity->logic_end - sparity->logic_start + 1;
         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
                                sparity->logic_start,
-                              &length, &bbio, 0, &raid_map);
-       if (ret || !bbio || !raid_map)
+                              &length, &bbio, 0, 1);
+       if (ret || !bbio || !bbio->raid_map)
                 goto bbio_out;
  
         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
         bio->bi_end_io = scrub_parity_bio_endio;
  
         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
-                                             raid_map, length,
-                                             sparity->scrub_dev,
+                                             length, sparity->scrub_dev,
                                               sparity->dbitmap,
                                               sparity->nsectors);
         if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
  rbio_out:
         bio_put(bio);
  bbio_out:
-       kfree(bbio);
-       kfree(raid_map);
+       btrfs_put_bbio(bbio);
         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
                   sparity->nsectors);
         spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
  
  static void scrub_parity_get(struct scrub_parity *sparity)
  {
-       atomic_inc(&sparity->ref_count);
+       atomic_inc(&sparity->refs);
  }
  
  static void scrub_parity_put(struct scrub_parity *sparity)
  {
-       if (!atomic_dec_and_test(&sparity->ref_count))
+       if (!atomic_dec_and_test(&sparity->refs))
                 return;
  
         scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
         sparity->scrub_dev = sdev;
         sparity->logic_start = logic_start;
         sparity->logic_end = logic_end;
-       atomic_set(&sparity->ref_count, 1);
+       atomic_set(&sparity->refs, 1);
         INIT_LIST_HEAD(&sparity->spages);
         sparity->dbitmap = sparity->bitmap;
         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
                 increment = map->stripe_len;
                 mirror_num = num % map->num_stripes + 1;
-       } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
                 increment = map->stripe_len * nr_data_stripes(map);
                 mirror_num = 1;
@@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
          */
         logical = base + offset;
         physical_end = physical + nstripes * map->stripe_len;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                 get_raid56_logic_offset(physical_end, num,
                                         map, &logic_end, NULL);
                 logic_end += base;
@@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         ret = 0;
         while (physical < physical_end) {
                 /* for raid56, we skip parity stripe */
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
+               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                         ret = get_raid56_logic_offset(physical, num,
                                         map, &logical, &stripe_logical);
                         logical += base;
@@ -3280,8 +3254,7 @@ again:
                         scrub_free_csums(sctx);
                         if (extent_logical + extent_len <
                             key.objectid + bytes) {
-                               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                                       BTRFS_BLOCK_GROUP_RAID6)) {
+                               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                                         /*
                                          * loop until we find next data stripe
                                          * or we have finished all stripes.
@@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
         scrub_workers_put(fs_info);
         mutex_unlock(&fs_info->scrub_lock);
  
-       scrub_free_ctx(sctx);
+       scrub_put_ctx(sctx);
  
         return ret;
  }
@@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                               &mapped_length, &bbio, 0);
         if (ret || !bbio || mapped_length < extent_len ||
             !bbio->stripes[0].dev->bdev) {
-               kfree(bbio);
+               btrfs_put_bbio(bbio);
                 return;
         }
  
         *extent_physical = bbio->stripes[0].physical;
         *extent_mirror_num = bbio->mirror_num;
         *extent_dev = bbio->stripes[0].dev;
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
  }
  
  static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c

index 804432d..fe58572 100644 (file)
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
         if (ret < 0)
                 goto out;
         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
-                       btrfs_inode_atime(ii));
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
-                       btrfs_inode_mtime(ii));
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
-                       btrfs_inode_ctime(ii));
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
         /* TODO Add otime support when the otime patches get into upstream */
  
         ret = send_cmd(sctx);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index 6f49b28..05fef19 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
         return btrfs_commit_transaction(trans, root);
  }
  
-static int btrfs_unfreeze(struct super_block *sb)
-{
-       return 0;
-}
-
  static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
  {
         struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
         .statfs         = btrfs_statfs,
         .remount_fs     = btrfs_remount,
         .freeze_fs      = btrfs_freeze,
-       .unfreeze_fs    = btrfs_unfreeze,
  };
  
  static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c

index 92db3f6..94edb0a 100644 (file)
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
  
         ret = btrfs_init_debugfs();
         if (ret)
-               return ret;
+               goto out1;
  
         init_feature_attrs();
         ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+       if (ret)
+               goto out2;
+
+       return 0;
+out2:
+       debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+       kset_unregister(btrfs_kset);
  
         return ret;
  }
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c

index cc286ce..f51963a 100644 (file)
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
                 return -ENOMEM;
         }
  
-       path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+       path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
         if (!eb) {
                 test_msg("Could not allocate dummy buffer\n");
                 ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c

index 7e99c2f..9e9f236 100644 (file)
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
         }
         ret = 0;
  out_bits:
-       clear_extent_bits(&tmp, 0, total_dirty - 1,
-                         (unsigned long)-1, GFP_NOFS);
+       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
  out:
         if (locked_page)
                 page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c

index 3ae0f5b..a116b55 100644 (file)
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
                 goto out;
         }
  
-       root->node = alloc_dummy_extent_buffer(0, 4096);
+       root->node = alloc_dummy_extent_buffer(NULL, 4096);
         if (!root->node) {
                 test_msg("Couldn't allocate dummy buffer\n");
                 goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
                 goto out;
         }
  
-       root->node = alloc_dummy_extent_buffer(0, 4096);
+       root->node = alloc_dummy_extent_buffer(NULL, 4096);
         if (!root->node) {
                 test_msg("Couldn't allocate dummy buffer\n");
                 goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c

index ec3dcb2..73f299e 100644 (file)
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
                 ret = -ENOMEM;
                 goto out;
         }
+       /* We are using this root as our extent root */
+       root->fs_info->extent_root = root;
+
+       /*
+        * Some of the paths we test assume we have a filled out fs_info, so we
+        * just need to add the root in there so we don't panic.
+        */
+       root->fs_info->tree_root = root;
+       root->fs_info->quota_root = root;
+       root->fs_info->quota_enabled = 1;
  
         /*
          * Can't use bytenr 0, some things freak out
          * *cough*backref walking code*cough*
          */
-       root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+       root->node = alloc_test_extent_buffer(root->fs_info, 4096);
         if (!root->node) {
                 test_msg("Couldn't allocate dummy buffer\n");
                 ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
                 goto out;
         }
  
-       /* We are using this root as our extent root */
-       root->fs_info->extent_root = root;
-
-       /*
-        * Some of the paths we test assume we have a filled out fs_info, so we
-        * just need to addt he root in there so we don't panic.
-        */
-       root->fs_info->tree_root = root;
-       root->fs_info->quota_root = root;
-       root->fs_info->quota_enabled = 1;
-
         test_msg("Running qgroup tests\n");
         ret = test_no_shared_qgroup(root);
         if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index e88b59d..7e80f32 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
          * commit the transaction.
          */
         atomic_set(&cur_trans->use_count, 2);
+       cur_trans->have_free_bgs = 0;
         cur_trans->start_time = get_seconds();
  
         cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
         INIT_LIST_HEAD(&cur_trans->pending_chunks);
         INIT_LIST_HEAD(&cur_trans->switch_commits);
         INIT_LIST_HEAD(&cur_trans->pending_ordered);
+       INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+       spin_lock_init(&cur_trans->dirty_bgs_lock);
         list_add_tail(&cur_trans->list, &fs_info->trans_list);
         extent_io_tree_init(&cur_trans->dirty_pages,
                              fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
         u64 old_root_bytenr;
         u64 old_root_used;
         struct btrfs_root *tree_root = root->fs_info->tree_root;
+       bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
  
         old_root_used = btrfs_root_used(&root->root_item);
         btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
         while (1) {
                 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                 if (old_root_bytenr == root->node->start &&
-                   old_root_used == btrfs_root_used(&root->root_item))
+                   old_root_used == btrfs_root_used(&root->root_item) &&
+                   (!extent_root ||
+                    list_empty(&trans->transaction->dirty_bgs)))
                         break;
  
                 btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                         return ret;
  
                 old_root_used = btrfs_root_used(&root->root_item);
-               ret = btrfs_write_dirty_block_groups(trans, root);
+               if (extent_root) {
+                       ret = btrfs_write_dirty_block_groups(trans, root);
+                       if (ret)
+                               return ret;
+               }
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                 if (ret)
                         return ret;
         }
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
         struct extent_buffer *eb;
         int ret;
  
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret)
-               return ret;
-
         eb = btrfs_lock_root_node(fs_info->tree_root);
         ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                               0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                 next = fs_info->dirty_cowonly_roots.next;
                 list_del_init(next);
                 root = list_entry(next, struct btrfs_root, dirty_list);
+               clear_bit(BTRFS_ROOT_DIRTY, &root->state);
  
                 if (root != fs_info->extent_root)
                         list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         switch_commit_roots(cur_trans, root->fs_info);
  
         assert_qgroups_uptodate(trans);
+       ASSERT(list_empty(&cur_trans->dirty_bgs));
         update_super_roots(root);
  
         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
         btrfs_finish_extent_commit(trans, root);
  
+       if (cur_trans->have_free_bgs)
+               btrfs_clear_space_info_full(root->fs_info);
+
         root->fs_info->last_trans_committed = cur_trans->transid;
         /*
          * We needn't acquire the lock here because there is no other task
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h

index 00ed29c..937050a 100644 (file)
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
         atomic_t num_writers;
         atomic_t use_count;
  
+       /*
+        * true if there are free bgs operations in this transaction
+        */
+       int have_free_bgs;
+
         /* Be protected by fs_info->trans_lock when we want to change it. */
         enum btrfs_trans_state state;
         struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
         struct list_head pending_chunks;
         struct list_head pending_ordered;
         struct list_head switch_commits;
+       struct list_head dirty_bgs;
+       spinlock_t dirty_bgs_lock;
         struct btrfs_delayed_ref_root delayed_refs;
         int aborted;
  };
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 1a9585d..9a37f8b 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
  insert:
         btrfs_release_path(path);
         /* try to insert the key into the destination tree */
+       path->skip_release_on_error = 1;
         ret = btrfs_insert_empty_item(trans, root, path,
                                       key, item_size);
+       path->skip_release_on_error = 0;
  
         /* make sure any existing item is the correct size */
-       if (ret == -EEXIST) {
+       if (ret == -EEXIST || ret == -EOVERFLOW) {
                 u32 found_size;
                 found_size = btrfs_item_size_nr(path->nodes[0],
                                                 path->slots[0]);
@@ -488,8 +490,20 @@ insert:
                 src_item = (struct btrfs_inode_item *)src_ptr;
                 dst_item = (struct btrfs_inode_item *)dst_ptr;
  
-               if (btrfs_inode_generation(eb, src_item) == 0)
+               if (btrfs_inode_generation(eb, src_item) == 0) {
+                       struct extent_buffer *dst_eb = path->nodes[0];
+
+                       if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+                               struct btrfs_map_token token;
+                               u64 ino_size = btrfs_inode_size(eb, src_item);
+
+                               btrfs_init_map_token(&token);
+                               btrfs_set_token_inode_size(dst_eb, dst_item,
+                                                          ino_size, &token);
+                       }
                         goto no_copy;
+               }
  
                 if (overwrite_root &&
                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
  static noinline int backref_in_log(struct btrfs_root *log,
                                    struct btrfs_key *key,
                                    u64 ref_objectid,
-                                  char *name, int namelen)
+                                  const char *name, int namelen)
  {
         struct btrfs_path *path;
         struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
  }
  
  static int insert_orphan_item(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 offset)
+                             struct btrfs_root *root, u64 ino)
  {
         int ret;
-       ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
-                       offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
-       if (ret > 0)
-               ret = btrfs_insert_orphan_item(trans, root, offset);
+
+       ret = btrfs_insert_orphan_item(trans, root, ino);
+       if (ret == -EEXIST)
+               ret = 0;
+
         return ret;
  }
  
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
                 leaf = path->nodes[0];
                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               cur_offset = 0;
  
                 while (cur_offset < item_size) {
                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
         }
         btrfs_release_path(path);
  
-       if (ret < 0)
+       if (ret < 0 && ret != -ENOENT)
                 return ret;
         return nlink;
  }
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
         nlink = ret;
  
         ret = count_inode_extrefs(root, inode, path);
-       if (ret == -ENOENT)
-               ret = 0;
-
         if (ret < 0)
                 goto out;
  
@@ -1556,6 +1569,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
         return ret;
  }
  
+/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+                           const char *name, const int name_len,
+                           const u64 dirid, const u64 ino)
+{
+       struct btrfs_key search_key;
+
+       search_key.objectid = ino;
+       search_key.type = BTRFS_INODE_REF_KEY;
+       search_key.offset = dirid;
+       if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+               return true;
+
+       search_key.type = BTRFS_INODE_EXTREF_KEY;
+       search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+       if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+               return true;
+
+       return false;
+}
+
  /*
   * take a single entry in a log directory item and replay it into
   * the subvolume.
@@ -1666,10 +1703,17 @@ out:
         return ret;
  
  insert:
+       if (name_in_log_ref(root->log_root, name, name_len,
+                           key->objectid, log_key.objectid)) {
+               /* The dentry will be added later. */
+               ret = 0;
+               update_size = false;
+               goto out;
+       }
         btrfs_release_path(path);
         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
                               name, name_len, log_type, &log_key);
-       if (ret && ret != -ENOENT)
+       if (ret && ret != -ENOENT && ret != -EEXIST)
                 goto out;
         update_size = false;
         ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                 parent = path->nodes[*level];
                 root_owner = btrfs_header_owner(parent);
  
-               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+               next = btrfs_find_create_tree_block(root, bytenr);
                 if (!next)
                         return -ENOMEM;
  
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
                 mutex_unlock(&root->log_mutex);
                 if (atomic_read(&root->log_writers))
                         schedule();
-               mutex_lock(&root->log_mutex);
                 finish_wait(&root->log_writer_wait, &wait);
+               mutex_lock(&root->log_mutex);
         }
  }
  
@@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
  static void fill_inode_item(struct btrfs_trans_handle *trans,
                             struct extent_buffer *leaf,
                             struct btrfs_inode_item *item,
-                           struct inode *inode, int log_inode_only)
+                           struct inode *inode, int log_inode_only,
+                           u64 logged_isize)
  {
         struct btrfs_map_token token;
  
@@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                  * to say 'update this inode with these values'
                  */
                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
-               btrfs_set_token_inode_size(leaf, item, 0, &token);
+               btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
         } else {
                 btrfs_set_token_inode_generation(leaf, item,
                                                  BTRFS_I(inode)->generation,
@@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->atime,
                                      inode->i_atime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                       inode->i_atime.tv_nsec, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                      inode->i_mtime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                       inode->i_mtime.tv_nsec, &token);
  
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                      inode->i_ctime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                       inode->i_ctime.tv_nsec, &token);
  
         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
                 return ret;
         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_inode_item);
-       fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+       fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
         btrfs_release_path(path);
         return 0;
  }
@@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                struct inode *inode,
                                struct btrfs_path *dst_path,
                                struct btrfs_path *src_path, u64 *last_extent,
-                              int start_slot, int nr, int inode_only)
+                              int start_slot, int nr, int inode_only,
+                              u64 logged_isize)
  {
         unsigned long src_offset;
         unsigned long dst_offset;
@@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                                     dst_path->slots[0],
                                                     struct btrfs_inode_item);
                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
-                                       inode, inode_only == LOG_INODE_EXISTS);
+                                       inode, inode_only == LOG_INODE_EXISTS,
+                                       logged_isize);
                 } else {
                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
                                            src_offset, ins_sizes[i]);
@@ -3902,6 +3949,33 @@ process:
         return ret;
  }
  
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+                            struct btrfs_path *path, u64 *size_ret)
+{
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+       if (ret < 0) {
+               return ret;
+       } else if (ret > 0) {
+               *size_ret = i_size_read(inode);
+       } else {
+               struct btrfs_inode_item *item;
+
+               item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_inode_item);
+               *size_ret = btrfs_inode_size(path->nodes[0], item);
+       }
+
+       btrfs_release_path(path);
+       return 0;
+}
+
  /* log a single inode in the tree log.
   * At least one parent directory for this inode must exist in the tree
   * or be logged already.
@@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         bool fast_search = false;
         u64 ino = btrfs_ino(inode);
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       u64 logged_isize = 0;
  
         path = btrfs_alloc_path();
         if (!path)
@@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                 max_key.type = (u8)-1;
         max_key.offset = (u64)-1;
  
-       /* Only run delayed items if we are a dir or a new file */
+       /*
+        * Only run delayed items if we are a dir or a new file.
+        * Otherwise commit the delayed inode only, which is needed in
+        * order for the log replay code to mark inodes for link count
+        * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+        */
         if (S_ISDIR(inode->i_mode) ||
-           BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+           BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
                 ret = btrfs_commit_inode_delayed_items(trans, inode);
-               if (ret) {
-                       btrfs_free_path(path);
-                       btrfs_free_path(dst_path);
-                       return ret;
-               }
+       else
+               ret = btrfs_commit_inode_delayed_inode(inode);
+
+       if (ret) {
+               btrfs_free_path(path);
+               btrfs_free_path(dst_path);
+               return ret;
         }
  
         mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         if (S_ISDIR(inode->i_mode)) {
                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
  
-               if (inode_only == LOG_INODE_EXISTS)
-                       max_key_type = BTRFS_XATTR_ITEM_KEY;
+               if (inode_only == LOG_INODE_EXISTS) {
+                       max_key_type = BTRFS_INODE_EXTREF_KEY;
+                       max_key.type = max_key_type;
+               }
                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
         } else {
-               if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-                                      &BTRFS_I(inode)->runtime_flags)) {
-                       clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                 &BTRFS_I(inode)->runtime_flags);
-                       ret = btrfs_truncate_inode_items(trans, log,
-                                                        inode, 0, 0);
-               } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                             &BTRFS_I(inode)->runtime_flags) ||
+               if (inode_only == LOG_INODE_EXISTS) {
+                       /*
+                        * Make sure the new inode item we write to the log has
+                        * the same isize as the current one (if it exists).
+                        * This is necessary to prevent data loss after log
+                        * replay, and also to prevent doing a wrong expanding
+                        * truncate - e.g. create file, write 4K into offset
+                        * 0, fsync, write 4K into offset 4096, add hard link,
+                        * fsync some other file (to sync log), power fail - if
+                        * we use the inode's current i_size, after log replay
+                        * we get an 8Kb file, with the last 4Kb extent as a hole
+                        * (zeroes), as if an expanding truncate happened,
+                        * instead of getting a file of 4Kb only.
+                        */
+                       err = logged_inode_size(log, inode, path,
+                                               &logged_isize);
+                       if (err)
+                               goto out_unlock;
+               }
+               if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                            &BTRFS_I(inode)->runtime_flags)) {
+                       if (inode_only == LOG_INODE_EXISTS) {
+                               max_key.type = BTRFS_INODE_EXTREF_KEY;
+                               ret = drop_objectid_items(trans, log, path, ino,
+                                                         max_key.type);
+                       } else {
+                               clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                                         &BTRFS_I(inode)->runtime_flags);
+                               clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                         &BTRFS_I(inode)->runtime_flags);
+                               ret = btrfs_truncate_inode_items(trans, log,
+                                                                inode, 0, 0);
+                       }
+               } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                   &BTRFS_I(inode)->runtime_flags) ||
                            inode_only == LOG_INODE_EXISTS) {
-                       if (inode_only == LOG_INODE_ALL)
+                       if (inode_only == LOG_INODE_ALL) {
+                               clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                         &BTRFS_I(inode)->runtime_flags);
                                 fast_search = true;
-                       max_key.type = BTRFS_XATTR_ITEM_KEY;
+                               max_key.type = BTRFS_XATTR_ITEM_KEY;
+                       } else {
+                               max_key.type = BTRFS_INODE_EXTREF_KEY;
+                       }
                         ret = drop_objectid_items(trans, log, path, ino,
                                                   max_key.type);
                 } else {
@@ -4047,7 +4163,8 @@ again:
                 }
  
                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
-                                ins_start_slot, ins_nr, inode_only);
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
                 if (ret < 0) {
                         err = ret;
                         goto out_unlock;
@@ -4071,7 +4188,7 @@ next_slot:
                 if (ins_nr) {
                         ret = copy_items(trans, inode, dst_path, path,
                                          &last_extent, ins_start_slot,
-                                        ins_nr, inode_only);
+                                        ins_nr, inode_only, logged_isize);
                         if (ret < 0) {
                                 err = ret;
                                 goto out_unlock;
@@ -4092,7 +4209,8 @@ next_slot:
         }
         if (ins_nr) {
                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
-                                ins_start_slot, ins_nr, inode_only);
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
                 if (ret < 0) {
                         err = ret;
                         goto out_unlock;
@@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         struct dentry *old_parent = NULL;
         int ret = 0;
         u64 last_committed = root->fs_info->last_trans_committed;
+       const struct dentry * const first_parent = parent;
+       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+                                last_committed);
  
         sb = inode->i_sb;
  
@@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 goto end_trans;
         }
  
-       inode_only = LOG_INODE_EXISTS;
         while (1) {
                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                         break;
@@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 if (root != BTRFS_I(inode)->root)
                         break;
  
+               /*
+                * On unlink we must make sure our immediate parent directory
+                * inode is fully logged. This is to prevent leaving dangling
+                * directory index entries and a wrong directory inode's i_size.
+                * Not doing so can result in a directory being impossible to
+                * delete after log replay (rmdir will always fail with error
+                * -ENOTEMPTY).
+                */
+               if (did_unlink && parent == first_parent)
+                       inode_only = LOG_INODE_ALL;
+               else
+                       inode_only = LOG_INODE_EXISTS;
+
                 if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed) {
+                   root->fs_info->last_trans_committed ||
+                   inode_only == LOG_INODE_ALL) {
                         ret = btrfs_log_inode(trans, root, inode, inode_only,
                                               0, LLONG_MAX, ctx);
                         if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index 50c5a87..cd4d131 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
         if (ret) {
                 btrfs_error(root->fs_info, ret,
                             "Failed to remove dev extent item");
+       } else {
+               trans->transaction->have_free_bgs = 1;
         }
  out:
         btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
  
  static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
  {
-       if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+       if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
                 return;
  
         btrfs_set_fs_incompat(info, RAID56);
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
  
         BUG_ON(em->start > logical || em->start + em->len < logical);
         map = (struct map_lookup *)em->bdev;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                 len = map->stripe_len * nr_data_stripes(map);
-       }
         free_extent_map(em);
         return len;
  }
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
  
         BUG_ON(em->start > logical || em->start + em->len < logical);
         map = (struct map_lookup *)em->bdev;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6))
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                 ret = 1;
         free_extent_map(em);
         return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
  }
  
  /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
  {
         struct btrfs_bio_stripe s;
-       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
         int i;
         u64 l;
         int again = 1;
-       int m;
  
         while (again) {
                 again = 0;
-               for (i = 0; i < real_stripes - 1; i++) {
-                       if (parity_smaller(raid_map[i], raid_map[i+1])) {
+               for (i = 0; i < num_stripes - 1; i++) {
+                       if (parity_smaller(bbio->raid_map[i],
+                                          bbio->raid_map[i+1])) {
                                 s = bbio->stripes[i];
-                               l = raid_map[i];
+                               l = bbio->raid_map[i];
                                 bbio->stripes[i] = bbio->stripes[i+1];
-                               raid_map[i] = raid_map[i+1];
+                               bbio->raid_map[i] = bbio->raid_map[i+1];
                                 bbio->stripes[i+1] = s;
-                               raid_map[i+1] = l;
-
-                               if (bbio->tgtdev_map) {
-                                       m = bbio->tgtdev_map[i];
-                                       bbio->tgtdev_map[i] =
-                                                       bbio->tgtdev_map[i + 1];
-                                       bbio->tgtdev_map[i + 1] = m;
-                               }
+                               bbio->raid_map[i+1] = l;
  
                                 again = 1;
                         }
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
         }
  }
  
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+       struct btrfs_bio *bbio = kzalloc(
+               sizeof(struct btrfs_bio) +
+               sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+               sizeof(int) * (real_stripes) +
+               sizeof(u64) * (real_stripes),
+               GFP_NOFS);
+       if (!bbio)
+               return NULL;
+
+       atomic_set(&bbio->error, 0);
+       atomic_set(&bbio->refs, 1);
+
+       return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+       WARN_ON(!atomic_read(&bbio->refs));
+       atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+       if (!bbio)
+               return;
+       if (atomic_dec_and_test(&bbio->refs))
+               kfree(bbio);
+}
+
  static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                              u64 logical, u64 *length,
                              struct btrfs_bio **bbio_ret,
-                            int mirror_num, u64 **raid_map_ret)
+                            int mirror_num, int need_raid_map)
  {
         struct extent_map *em;
         struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
         u64 stripe_nr_orig;
         u64 stripe_nr_end;
         u64 stripe_len;
-       u64 *raid_map = NULL;
         int stripe_index;
         int i;
         int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
         stripe_offset = offset - stripe_offset;
  
         /* if we're here for raid56, we need to know the stripe aligned start */
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
                 raid56_full_stripe_start = offset;
  
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
  
         if (rw & REQ_DISCARD) {
                 /* we don't discard raid56 yet */
-               if (map->type &
-                   (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                         ret = -EOPNOTSUPP;
                         goto out;
                 }
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 /* For writes to RAID[56], allow a full stripeset across all disks.
                    For other RAID types and for RAID[56] reads, just allow a single
                    stripe (on a single disk). */
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+               if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
                     (rw & REQ_WRITE)) {
                         max_len = stripe_len * nr_data_stripes(map) -
                                 (offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 u64 physical_of_found = 0;
  
                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-                            logical, &tmp_length, &tmp_bbio, 0, NULL);
+                            logical, &tmp_length, &tmp_bbio, 0, 0);
                 if (ret) {
                         WARN_ON(tmp_bbio != NULL);
                         goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                          * is not left of the left cursor
                          */
                         ret = -EIO;
-                       kfree(tmp_bbio);
+                       btrfs_put_bbio(tmp_bbio);
                         goto out;
                 }
  
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 } else {
                         WARN_ON(1);
                         ret = -EIO;
-                       kfree(tmp_bbio);
+                       btrfs_put_bbio(tmp_bbio);
                         goto out;
                 }
  
-               kfree(tmp_bbio);
+               btrfs_put_bbio(tmp_bbio);
         } else if (mirror_num > map->num_stripes) {
                 mirror_num = 0;
         }
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                         mirror_num = stripe_index - old_stripe_index + 1;
                 }
  
-       } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
-               u64 tmp;
-
-               if (raid_map_ret &&
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               if (need_raid_map &&
                     ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
                      mirror_num > 1)) {
-                       int i, rot;
-
                         /* push stripe_nr back to the start of the full stripe */
                         stripe_nr = raid56_full_stripe_start;
                         do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                         num_stripes = map->num_stripes;
                         max_errors = nr_parity_stripes(map);
  
-                       raid_map = kmalloc_array(num_stripes, sizeof(u64),
-                                          GFP_NOFS);
-                       if (!raid_map) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-
-                       /* Work out the disk rotation on this stripe-set */
-                       tmp = stripe_nr;
-                       rot = do_div(tmp, num_stripes);
-
-                       /* Fill in the logical address of each stripe */
-                       tmp = stripe_nr * nr_data_stripes(map);
-                       for (i = 0; i < nr_data_stripes(map); i++)
-                               raid_map[(i+rot) % num_stripes] =
-                                       em->start + (tmp + i) * map->stripe_len;
-
-                       raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
-                       if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-                               raid_map[(i+rot+1) % num_stripes] =
-                                       RAID6_Q_STRIPE;
-
                         *length = map->stripe_len;
                         stripe_index = 0;
                         stripe_offset = 0;
                 } else {
+                       u64 tmp;
+
                         /*
                          * Mirror #0 or #1 means the original data block.
                          * Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 tgtdev_indexes = num_stripes;
         }
  
-       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
-                      GFP_NOFS);
+       bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
         if (!bbio) {
-               kfree(raid_map);
                 ret = -ENOMEM;
                 goto out;
         }
-       atomic_set(&bbio->error, 0);
         if (dev_replace_is_ongoing)
                 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
  
+       /* build raid_map */
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+           need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+           mirror_num > 1)) {
+               u64 tmp;
+               int i, rot;
+
+               bbio->raid_map = (u64 *)((void *)bbio->stripes +
+                                sizeof(struct btrfs_bio_stripe) *
+                                num_alloc_stripes +
+                                sizeof(int) * tgtdev_indexes);
+
+               /* Work out the disk rotation on this stripe-set */
+               tmp = stripe_nr;
+               rot = do_div(tmp, num_stripes);
+
+               /* Fill in the logical address of each stripe */
+               tmp = stripe_nr * nr_data_stripes(map);
+               for (i = 0; i < nr_data_stripes(map); i++)
+                       bbio->raid_map[(i+rot) % num_stripes] =
+                               em->start + (tmp + i) * map->stripe_len;
+
+               bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+               if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+                       bbio->raid_map[(i+rot+1) % num_stripes] =
+                               RAID6_Q_STRIPE;
+       }
+
         if (rw & REQ_DISCARD) {
                 int factor = 0;
                 int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                 max_errors = btrfs_chunk_max_errors(map);
  
+       if (bbio->raid_map)
+               sort_parity_stripes(bbio, num_stripes);
+
         tgtdev_indexes = 0;
         if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
             dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
         }
  
         *bbio_ret = bbio;
+       bbio->map_type = map->type;
         bbio->num_stripes = num_stripes;
         bbio->max_errors = max_errors;
         bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
                 bbio->mirror_num = map->num_stripes + 1;
         }
-       if (raid_map) {
-               sort_parity_stripes(bbio, raid_map);
-               *raid_map_ret = raid_map;
-       }
  out:
         if (dev_replace_is_ongoing)
                 btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                       struct btrfs_bio **bbio_ret, int mirror_num)
  {
         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-                                mirror_num, NULL);
+                                mirror_num, 0);
  }
  
  /* For Scrub/replace */
  int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
                      u64 logical, u64 *length,
                      struct btrfs_bio **bbio_ret, int mirror_num,
-                    u64 **raid_map_ret)
+                    int need_raid_map)
  {
         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-                                mirror_num, raid_map_ret);
+                                mirror_num, need_raid_map);
  }
  
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                 do_div(length, map->num_stripes / map->sub_stripes);
         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                 do_div(length, map->num_stripes);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                             BTRFS_BLOCK_GROUP_RAID6)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                 do_div(length, nr_data_stripes(map));
                 rmap_len = map->stripe_len * nr_data_stripes(map);
         }
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
                 bio_endio_nodec(bio, err);
         else
                 bio_endio(bio, err);
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
  }
  
  static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
         u64 length = 0;
         u64 map_length;
-       u64 *raid_map = NULL;
         int ret;
         int dev_nr = 0;
         int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
  
         btrfs_bio_counter_inc_blocked(root->fs_info);
         ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-                             mirror_num, &raid_map);
+                             mirror_num, 1);
         if (ret) {
                 btrfs_bio_counter_dec(root->fs_info);
                 return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         bbio->fs_info = root->fs_info;
         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
  
-       if (raid_map) {
+       if (bbio->raid_map) {
                 /* In this case, map_length has been set to the length of
                    a single stripe; not the whole write */
                 if (rw & WRITE) {
-                       ret = raid56_parity_write(root, bio, bbio,
-                                                 raid_map, map_length);
+                       ret = raid56_parity_write(root, bio, bbio, map_length);
                 } else {
-                       ret = raid56_parity_recover(root, bio, bbio,
-                                                   raid_map, map_length,
+                       ret = raid56_parity_recover(root, bio, bbio, map_length,
                                                     mirror_num, 1);
                 }
  
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
         struct extent_buffer *sb;
         struct btrfs_disk_key *disk_key;
         struct btrfs_chunk *chunk;
-       u8 *ptr;
-       unsigned long sb_ptr;
+       u8 *array_ptr;
+       unsigned long sb_array_offset;
         int ret = 0;
         u32 num_stripes;
         u32 array_size;
         u32 len = 0;
-       u32 cur;
+       u32 cur_offset;
         struct btrfs_key key;
  
-       sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
-                                         BTRFS_SUPER_INFO_SIZE);
+       ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+       /*
+        * This will create extent buffer of nodesize, superblock size is
+        * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+        * overallocate but we can keep it as-is, only the first page is used.
+        */
+       sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
         if (!sb)
                 return -ENOMEM;
         btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
         array_size = btrfs_super_sys_array_size(super_copy);
  
-       ptr = super_copy->sys_chunk_array;
-       sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
-       cur = 0;
+       array_ptr = super_copy->sys_chunk_array;
+       sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+       cur_offset = 0;
+
+       while (cur_offset < array_size) {
+               disk_key = (struct btrfs_disk_key *)array_ptr;
+               len = sizeof(*disk_key);
+               if (cur_offset + len > array_size)
+                       goto out_short_read;
  
-       while (cur < array_size) {
-               disk_key = (struct btrfs_disk_key *)ptr;
                 btrfs_disk_key_to_cpu(&key, disk_key);
  
-               len = sizeof(*disk_key); ptr += len;
-               sb_ptr += len;
-               cur += len;
+               array_ptr += len;
+               sb_array_offset += len;
+               cur_offset += len;
  
                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
-                       chunk = (struct btrfs_chunk *)sb_ptr;
+                       chunk = (struct btrfs_chunk *)sb_array_offset;
+                       /*
+                        * At least one btrfs_chunk with one stripe must be
+                        * present, exact stripe count check comes afterwards
+                        */
+                       len = btrfs_chunk_item_size(1);
+                       if (cur_offset + len > array_size)
+                               goto out_short_read;
+
+                       num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+                       len = btrfs_chunk_item_size(num_stripes);
+                       if (cur_offset + len > array_size)
+                               goto out_short_read;
+
                         ret = read_one_chunk(root, &key, sb, chunk);
                         if (ret)
                                 break;
-                       num_stripes = btrfs_chunk_num_stripes(sb, chunk);
-                       len = btrfs_chunk_item_size(num_stripes);
                 } else {
                         ret = -EIO;
                         break;
                 }
-               ptr += len;
-               sb_ptr += len;
-               cur += len;
+               array_ptr += len;
+               sb_array_offset += len;
+               cur_offset += len;
         }
         free_extent_buffer(sb);
         return ret;
+
+out_short_read:
+       printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+                       len, cur_offset);
+       free_extent_buffer(sb);
+       return -EIO;
  }
  
  int btrfs_read_chunk_tree(struct btrfs_root *root)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h

index d6fe73c..83069de 100644 (file)
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
  #define BTRFS_BIO_ORIG_BIO_SUBMITTED   (1 << 0)
  
  struct btrfs_bio {
+       atomic_t refs;
         atomic_t stripes_pending;
         struct btrfs_fs_info *fs_info;
+       u64 map_type; /* get from map_lookup->type */
         bio_end_io_t *end_io;
         struct bio *orig_bio;
         unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
         int mirror_num;
         int num_tgtdevs;
         int *tgtdev_map;
+       /*
+        * logical block numbers for the start of each stripe
+        * The last one or two are p/q.  These are sorted,
+        * so raid_map[0] is the start of our full stripe
+        */
+       u64 *raid_map;
         struct btrfs_bio_stripe stripes[];
  };
  
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
  
  int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                    u64 end, u64 *length);
-
-#define btrfs_bio_size(total_stripes, real_stripes)            \
-       (sizeof(struct btrfs_bio) +                             \
-        (sizeof(struct btrfs_bio_stripe) * (total_stripes)) +  \
-        (sizeof(int) * (real_stripes)))
-
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
  int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                     u64 logical, u64 *length,
                     struct btrfs_bio **bbio_ret, int mirror_num);
  int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
                      u64 logical, u64 *length,
                      struct btrfs_bio **bbio_ret, int mirror_num,
-                    u64 **raid_map_ret);
+                    int need_raid_map);
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                      u64 chunk_start, u64 physical, u64 devid,
                      u64 **logical, int *naddrs, int *stripe_len);
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h

index 611e1c5..b6dec05 100644 (file)
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args {
  
  /* Error codes as returned by the kernel */
  enum btrfs_err_code {
-       notused,
-       BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
         BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
         BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
         BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
fs/btrfs/backref.c		patch \| blob \| history
fs/btrfs/backref.h		patch \| blob \| history
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/ctree.c		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/delayed-inode.c		patch \| blob \| history
fs/btrfs/dev-replace.c		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/disk-io.h		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/extent_io.c		patch \| blob \| history
fs/btrfs/extent_io.h		patch \| blob \| history
fs/btrfs/free-space-cache.c		patch \| blob \| history
fs/btrfs/inode-item.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/qgroup.c		patch \| blob \| history
fs/btrfs/raid56.c		patch \| blob \| history
fs/btrfs/raid56.h		patch \| blob \| history
fs/btrfs/reada.c		patch \| blob \| history
fs/btrfs/relocation.c		patch \| blob \| history
fs/btrfs/scrub.c		patch \| blob \| history
fs/btrfs/send.c		patch \| blob \| history
fs/btrfs/super.c		patch \| blob \| history
fs/btrfs/sysfs.c		patch \| blob \| history
fs/btrfs/tests/extent-buffer-tests.c		patch \| blob \| history
fs/btrfs/tests/extent-io-tests.c		patch \| blob \| history
fs/btrfs/tests/inode-tests.c		patch \| blob \| history
fs/btrfs/tests/qgroup-tests.c		patch \| blob \| history
fs/btrfs/transaction.c		patch \| blob \| history
fs/btrfs/transaction.h		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history
fs/btrfs/volumes.c		patch \| blob \| history
fs/btrfs/volumes.h		patch \| blob \| history
include/uapi/linux/btrfs.h		patch \| blob \| history