Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 19 Feb 2015 22:36:00 +0000 (14:36 -0800)
Pull btrfs updates from Chris Mason:
 "This pull is mostly cleanups and fixes:

   - The raid5/6 cleanups from Zhao Lei fixup some long standing warts
     in the code and add improvements on top of the scrubbing support
     from 3.19.

   - Josef has round one of our ENOSPC fixes coming from large btrfs
     clusters here at FB.

   - Dave Sterba continues a long series of cleanups (thanks Dave), and
     Filipe continues hammering on corner cases in fsync and others

  This all was held up a little trying to track down a use-after-free in
  btrfs raid5/6.  It's not clear yet if this is just made easier to
  trigger with this pull or if its a new bug from the raid5/6 cleanups.
  Dave Sterba is the only one to trigger it so far, but he has a
  consistent way to reproduce, so we'll get it nailed shortly"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (68 commits)
  Btrfs: don't remove extents and xattrs when logging new names
  Btrfs: fix fsync data loss after adding hard link to inode
  Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group
  Btrfs: account for large extents with enospc
  Btrfs: don't set and clear delalloc for O_DIRECT writes
  Btrfs: only adjust outstanding_extents when we do a short write
  btrfs: Fix out-of-space bug
  Btrfs: scrub, fix sleep in atomic context
  Btrfs: fix scheduler warning when syncing log
  Btrfs: Remove unnecessary placeholder in btrfs_err_code
  btrfs: cleanup init for list in free-space-cache
  btrfs: delete chunk allocation attemp when setting block group ro
  btrfs: clear bio reference after submit_one_bio()
  Btrfs: fix scrub race leading to use-after-free
  Btrfs: add missing cleanup on sysfs init failure
  Btrfs: fix race between transaction commit and empty block group removal
  btrfs: add more checks to btrfs_read_sys_array
  btrfs: cleanup, rename a few variables in btrfs_read_sys_array
  btrfs: add checks for sys_chunk_array sizes
  btrfs: more superblock checks, lower bounds on devices and sectorsize/nodesize
  ...

34 files changed:
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/free-space-cache.c
fs/btrfs/inode-item.c
fs/btrfs/inode.c
fs/btrfs/qgroup.c
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/reada.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/tests/extent-buffer-tests.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/tests/qgroup-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
include/uapi/linux/btrfs.h

index 8729cf6..f55721f 100644 (file)
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-/*
- * this makes the path point to (inum INODE_ITEM ioff)
- */
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                       struct btrfs_path *path)
-{
-       struct btrfs_key key;
-       return btrfs_find_item(fs_root, path, inum, ioff,
-                       BTRFS_INODE_ITEM_KEY, &key);
-}
-
-static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                               struct btrfs_path *path,
-                               struct btrfs_key *found_key)
-{
-       return btrfs_find_item(fs_root, path, inum, ioff,
-                       BTRFS_INODE_REF_KEY, found_key);
-}
-
 int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
                          u64 start_off, struct btrfs_path *path,
                          struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                }
-               ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+               ret = btrfs_find_item(fs_root, path, parent, 0,
+                               BTRFS_INODE_REF_KEY, &found_key);
                if (ret > 0)
                        ret = -ENOENT;
                if (ret)
@@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
        struct btrfs_key found_key;
 
        while (!ret) {
-               ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-                                    &found_key);
+               ret = btrfs_find_item(fs_root, path, inum,
+                               parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+                               &found_key);
+
                if (ret < 0)
                        break;
                if (ret) {
index 2a1ac6b..9c41fba 100644 (file)
@@ -32,9 +32,6 @@ struct inode_fs_paths {
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
                void *ctx);
 
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
-                       struct btrfs_path *path);
-
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
                        struct btrfs_path *path, struct btrfs_key *found_key,
                        u64 *flags);
index 4aadadc..de5e4f2 100644 (file)
@@ -185,6 +185,9 @@ struct btrfs_inode {
 
        struct btrfs_delayed_node *delayed_node;
 
+       /* File creation time. */
+       struct timespec i_otime;
+
        struct inode vfs_inode;
 };
 
index 14a72ed..9936421 100644 (file)
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
  */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
+       if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+           !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+               return;
+
        spin_lock(&root->fs_info->trans_lock);
-       if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
-           list_empty(&root->dirty_list)) {
-               list_add(&root->dirty_list,
-                        &root->fs_info->dirty_cowonly_roots);
+       if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+               /* Want the extent tree to be the last on the list */
+               if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+                       list_move_tail(&root->dirty_list,
+                                      &root->fs_info->dirty_cowonly_roots);
+               else
+                       list_move(&root->dirty_list,
+                                 &root->fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&root->fs_info->trans_lock);
 }
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 
        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
-               eb_rewin = alloc_dummy_extent_buffer(eb->start,
-                                               fs_info->tree_root->nodesize);
+               eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
        } else if (old_root) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
-               eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+               eb = alloc_dummy_extent_buffer(root->fs_info, logical);
        } else {
                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
                        gen = btrfs_node_ptr_generation(node, nr);
-                       readahead_tree_block(root, search, blocksize);
+                       readahead_tree_block(root, search);
                        nread += blocksize;
                }
                nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
        u64 gen;
        u64 block1 = 0;
        u64 block2 = 0;
-       int blocksize;
 
        parent = path->nodes[level + 1];
        if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 
        nritems = btrfs_header_nritems(parent);
        slot = path->slots[level + 1];
-       blocksize = root->nodesize;
 
        if (slot > 0) {
                block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
        }
 
        if (block1)
-               readahead_tree_block(root, block1, blocksize);
+               readahead_tree_block(root, block1);
        if (block2)
-               readahead_tree_block(root, block2, blocksize);
+               readahead_tree_block(root, block2);
 }
 
 
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
        return 0;
 }
 
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
                u64 iobjectid, u64 ioff, u8 key_type,
                struct btrfs_key *found_key)
 {
        int ret;
        struct btrfs_key key;
        struct extent_buffer *eb;
-       struct btrfs_path *path;
+
+       ASSERT(path);
+       ASSERT(found_key);
 
        key.type = key_type;
        key.objectid = iobjectid;
        key.offset = ioff;
 
-       if (found_path == NULL) {
-               path = btrfs_alloc_path();
-               if (!path)
-                       return -ENOMEM;
-       } else
-               path = found_path;
-
        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
-       if ((ret < 0) || (found_key == NULL)) {
-               if (path != found_path)
-                       btrfs_free_path(path);
+       if (ret < 0)
                return ret;
-       }
 
        eb = path->nodes[0];
        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        add_root_to_dirty_list(root);
        extent_buffer_get(c);
        path->nodes[level] = c;
-       path->locks[level] = BTRFS_WRITE_LOCK;
+       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
        path->slots[level] = 0;
        return 0;
 }
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
        path->search_for_split = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        path->search_for_split = 0;
+       if (ret > 0)
+               ret = -EAGAIN;
        if (ret < 0)
                goto err;
 
        ret = -EAGAIN;
        leaf = path->nodes[0];
-       /* if our item isn't there or got smaller, return now */
-       if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+       /* if our item isn't there, return now */
+       if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
                goto err;
 
        /* the leaf has  changed, it now has room.  return now */
index 0b18070..84c3b00 100644 (file)
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 
 #define BTRFS_DIRTY_METADATA_THRESH    (32 * 1024 * 1024)
 
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
                                         BTRFS_BLOCK_GROUP_RAID6 |   \
                                         BTRFS_BLOCK_GROUP_DUP |     \
                                         BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK  (BTRFS_BLOCK_GROUP_RAID5 |   \
+                                        BTRFS_BLOCK_GROUP_RAID6)
+
 /*
  * We need a bit for restriper to be able to tell when chunks of type
  * SINGLE are available.  This "extended" profile format is used in
@@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state {
        BTRFS_DC_ERROR          = 1,
        BTRFS_DC_CLEAR          = 2,
        BTRFS_DC_SETUP          = 3,
-       BTRFS_DC_NEED_WRITE     = 4,
 };
 
 struct btrfs_caching_control {
@@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache {
        unsigned long full_stripe_len;
 
        unsigned int ro:1;
-       unsigned int dirty:1;
        unsigned int iref:1;
        unsigned int has_caching_ctl:1;
        unsigned int removed:1;
@@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache {
        struct list_head ro_list;
 
        atomic_t trimming;
+
+       /* For dirty block groups */
+       struct list_head dirty_list;
 };
 
 /* delayed seq elem */
@@ -1741,6 +1747,7 @@ struct btrfs_fs_info {
 
        spinlock_t unused_bgs_lock;
        struct list_head unused_bgs;
+       struct mutex unused_bg_unpin_mutex;
 
        /* For btrfs to record security options */
        struct security_mnt_opts security_opts;
@@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers {
 #define BTRFS_ROOT_DEFRAG_RUNNING      6
 #define BTRFS_ROOT_FORCE_COW           7
 #define BTRFS_ROOT_MULTI_LOG_TASKS     8
+#define BTRFS_ROOT_DIRTY               9
 
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
@@ -1794,8 +1802,6 @@ struct btrfs_root {
        struct btrfs_fs_info *fs_info;
        struct extent_io_tree dirty_log_pages;
 
-       struct kobject root_kobj;
-       struct completion kobj_unregister;
        struct mutex objectid_mutex;
 
        spinlock_t accounting_lock;
@@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-static inline struct btrfs_timespec *
-btrfs_inode_atime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, atime);
-       return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, mtime);
-       return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
-{
-       unsigned long ptr = (unsigned long)inode_item;
-       ptr += offsetof(struct btrfs_inode_item, ctime);
-       return (struct btrfs_timespec *)ptr;
-}
-
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
index de4e70f..82f0c7c 100644 (file)
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
        btrfs_set_stack_inode_block_group(inode_item, 0);
 
-       btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->atime,
                                     inode->i_atime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->atime,
                                      inode->i_atime.tv_nsec);
 
-       btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->mtime,
                                     inode->i_mtime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->mtime,
                                      inode->i_mtime.tv_nsec);
 
-       btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+       btrfs_set_stack_timespec_sec(&inode_item->ctime,
                                     inode->i_ctime.tv_sec);
-       btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+       btrfs_set_stack_timespec_nsec(&inode_item->ctime,
                                      inode->i_ctime.tv_nsec);
+
+       btrfs_set_stack_timespec_sec(&inode_item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_sec);
+       btrfs_set_stack_timespec_nsec(&inode_item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_nsec);
 }
 
 int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 {
        struct btrfs_delayed_node *delayed_node;
        struct btrfs_inode_item *inode_item;
-       struct btrfs_timespec *tspec;
 
        delayed_node = btrfs_get_delayed_node(inode);
        if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
        *rdev = btrfs_stack_inode_rdev(inode_item);
        BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
 
-       tspec = btrfs_inode_atime(inode_item);
-       inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+       inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+       inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+       inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
 
-       tspec = btrfs_inode_mtime(inode_item);
-       inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+       inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
 
-       tspec = btrfs_inode_ctime(inode_item);
-       inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
-       inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+       BTRFS_I(inode)->i_otime.tv_sec =
+               btrfs_stack_timespec_sec(&inode_item->otime);
+       BTRFS_I(inode)->i_otime.tv_nsec =
+               btrfs_stack_timespec_nsec(&inode_item->otime);
 
        inode->i_generation = BTRFS_I(inode)->generation;
        BTRFS_I(inode)->index_cnt = (u64)-1;
index ca6a3a3..5ec03d9 100644 (file)
@@ -440,18 +440,9 @@ leave:
  */
 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 {
-       s64 writers;
-       DEFINE_WAIT(wait);
-
        set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-       do {
-               prepare_to_wait(&fs_info->replace_wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
-               writers = percpu_counter_sum(&fs_info->bio_counter);
-               if (writers)
-                       schedule();
-               finish_wait(&fs_info->replace_wait, &wait);
-       } while (writers);
+       wait_event(fs_info->replace_wait, !percpu_counter_sum(
+                  &fs_info->bio_counter));
 }
 
 /*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
 {
-       DEFINE_WAIT(wait);
-again:
-       percpu_counter_inc(&fs_info->bio_counter);
-       if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+       while (1) {
+               percpu_counter_inc(&fs_info->bio_counter);
+               if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+                                    &fs_info->fs_state)))
+                       break;
+
                btrfs_bio_counter_dec(fs_info);
                wait_event(fs_info->replace_wait,
                           !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
                                     &fs_info->fs_state));
-               goto again;
        }
-
 }
index 1afb182..f79f385 100644 (file)
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                        memcpy(&found, result, csum_size);
 
                        read_extent_buffer(buf, &val, 0, csum_size);
-                       printk_ratelimited(KERN_INFO
+                       printk_ratelimited(KERN_WARNING
                                "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
                                "level %d\n",
                                root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                ret = 0;
                goto out;
        }
-       printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+       printk_ratelimited(KERN_ERR
+           "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
                        eb->fs_info->sb->s_id, eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
-               printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
+               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
                               "%llu %llu\n",
                               eb->fs_info->sb->s_id, found_start, eb->start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(root, eb)) {
-               printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
                               eb->fs_info->sb->s_id, eb->start);
                ret = -EIO;
                goto err;
        }
        found_level = btrfs_header_level(eb);
        if (found_level >= BTRFS_MAX_LEVEL) {
-               btrfs_info(root->fs_info, "bad tree block level %d",
+               btrfs_err(root->fs_info, "bad tree block level %d",
                           (int)btrfs_header_level(eb));
                ret = -EIO;
                goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
        .set_page_dirty = btree_set_page_dirty,
 };
 
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
 {
        struct extent_buffer *buf = NULL;
        struct inode *btree_inode = root->fs_info->btree_inode;
 
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
        if (!buf)
                return;
        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
        free_extent_buffer(buf);
 }
 
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
                         int mirror_num, struct extent_buffer **eb)
 {
        struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
        struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
        int ret;
 
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
        if (!buf)
                return 0;
 
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-                                                u64 bytenr, u32 blocksize)
+                                                u64 bytenr)
 {
        if (btrfs_test_is_dummy_root(root))
-               return alloc_test_extent_buffer(root->fs_info, bytenr,
-                                               blocksize);
-       return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
+               return alloc_test_extent_buffer(root->fs_info, bytenr);
+       return alloc_extent_buffer(root->fs_info, bytenr);
 }
 
 
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
        struct extent_buffer *buf = NULL;
        int ret;
 
-       buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
        if (!buf)
                return NULL;
 
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
-       memset(&root->root_kobj, 0, sizeof(root->root_kobj));
        if (fs_info)
                root->defrag_trans_start = fs_info->generation;
        else
                root->defrag_trans_start = 0;
-       init_completion(&root->kobj_unregister);
        root->root_key.objectid = objectid;
        root->anon_dev = 0;
 
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
                                     bool check_ref)
 {
        struct btrfs_root *root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
        int ret;
 
        if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
        if (ret)
                goto fail;
 
-       ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
-                       location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
+       key.offset = location->objectid;
+
+       ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+       btrfs_free_path(path);
        if (ret < 0)
                goto fail;
        if (ret == 0)
@@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->qgroup_op_lock);
        spin_lock_init(&fs_info->buffer_lock);
        spin_lock_init(&fs_info->unused_bgs_lock);
+       mutex_init(&fs_info->unused_bg_unpin_mutex);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
@@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb,
                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
 
        if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-               printk(KERN_ERR "BTRFS: has skinny extents\n");
+               printk(KERN_INFO "BTRFS: has skinny extents\n");
 
        /*
         * flag our filesystem as having big metadata blocks if
@@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb,
         */
        if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
            (sectorsize != nodesize)) {
-               printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+               printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
                                "are not allowed for mixed block groups on %s\n",
                                sb->s_id);
                goto fail_alloc;
@@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb,
        sb->s_blocksize_bits = blksize_bits(sectorsize);
 
        if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-               printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+               printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
                goto fail_sb_buffer;
        }
 
        if (sectorsize != PAGE_SIZE) {
-               printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+               printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
                       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
                goto fail_sb_buffer;
        }
@@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb,
        ret = btrfs_read_sys_array(tree_root);
        mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to read the system "
+               printk(KERN_ERR "BTRFS: failed to read the system "
                       "array on %s\n", sb->s_id);
                goto fail_sb_buffer;
        }
@@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb,
                                           generation);
        if (!chunk_root->node ||
            !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
-               printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
                       sb->s_id);
                goto fail_tree_roots;
        }
@@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb,
 
        ret = btrfs_read_chunk_tree(chunk_root);
        if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
                       sb->s_id);
                goto fail_tree_roots;
        }
@@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb,
        btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
-               printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+               printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
                       sb->s_id);
                goto fail_tree_roots;
        }
@@ -2765,7 +2775,7 @@ retry_root_backup:
 
        ret = btrfs_recover_balance(fs_info);
        if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+               printk(KERN_ERR "BTRFS: failed to recover balance\n");
                goto fail_block_groups;
        }
 
@@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
                                btrfs_super_log_root(sb));
 
+       /*
+        * Check the lower bound, the alignment and other constraints are
+        * checked later.
+        */
+       if (btrfs_super_nodesize(sb) < 4096) {
+               printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+                               btrfs_super_nodesize(sb));
+               ret = -EINVAL;
+       }
+       if (btrfs_super_sectorsize(sb) < 4096) {
+               printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+                               btrfs_super_sectorsize(sb));
+               ret = -EINVAL;
+       }
+
        if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
                printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
                                fs_info->fsid, sb->dev_item.fsid);
@@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
        if (btrfs_super_num_devices(sb) > (1UL << 31))
                printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
                                btrfs_super_num_devices(sb));
+       if (btrfs_super_num_devices(sb) == 0) {
+               printk(KERN_ERR "BTRFS: number of devices is 0\n");
+               ret = -EINVAL;
+       }
 
        if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
                printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3880,6 +3909,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                ret = -EINVAL;
        }
 
+       /*
+        * Obvious sys_chunk_array corruptions, it must hold at least one key
+        * and one chunk
+        */
+       if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+               printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+                               btrfs_super_sys_array_size(sb),
+                               BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+                       + sizeof(struct btrfs_chunk)) {
+               printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+                               btrfs_super_sys_array_size(sb),
+                               sizeof(struct btrfs_disk_key)
+                               + sizeof(struct btrfs_chunk));
+               ret = -EINVAL;
+       }
+
        /*
         * The generation is a global counter, we'll trust it more than the others
         * but it's still possible that it's the one that's wrong.
index 4146518..27d44c0 100644 (file)
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                                      u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
                         int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-                                                  u64 bytenr, u32 blocksize);
+                                                  u64 bytenr);
 void clean_tree_block(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct extent_buffer *buf);
 int open_ctree(struct super_block *sb,
index a684086..571f402 100644 (file)
@@ -74,8 +74,9 @@ enum {
        RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                         */
                        ret = 0;
                }
-               kfree(bbio);
+               btrfs_put_bbio(bbio);
        }
 
        if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int ret;
        int run_all = count == (unsigned long)-1;
-       int run_most = 0;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                root = root->fs_info->tree_root;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       if (count == 0) {
+       if (count == 0)
                count = atomic_read(&delayed_refs->num_entries) * 2;
-               run_most = 1;
-       }
 
 again:
 #ifdef SCRAMBLE_DELAYED_REFS
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
        struct btrfs_block_group_cache *cache;
-       int err = 0;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
        struct btrfs_path *path;
-       u64 last = 0;
+
+       if (list_empty(&cur_trans->dirty_bgs))
+               return 0;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-again:
-       while (1) {
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-               err = cache_save_setup(cache, trans, path);
-               last = cache->key.objectid + cache->key.offset;
-               btrfs_put_block_group(cache);
-       }
-
-       while (1) {
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-
-                       if (cache->dirty)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               if (cache->disk_cache_state == BTRFS_DC_SETUP)
-                       cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
-               cache->dirty = 0;
-               last = cache->key.objectid + cache->key.offset;
-
-               err = write_one_cache_group(trans, root, path, cache);
-               btrfs_put_block_group(cache);
-               if (err) /* File system offline */
-                       goto out;
-       }
-
-       while (1) {
-               /*
-                * I don't think this is needed since we're just marking our
-                * preallocated extent as written, but just in case it can't
-                * hurt.
-                */
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       /*
-                        * Really this shouldn't happen, but it could if we
-                        * couldn't write the entire preallocated extent and
-                        * splitting the extent resulted in a new block.
-                        */
-                       if (cache->dirty) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-                       if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               err = btrfs_write_out_cache(root, trans, cache, path);
-
-               /*
-                * If we didn't have an error then the cache state is still
-                * NEED_WRITE, so we can set it to WRITTEN.
-                */
-               if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                       cache->disk_cache_state = BTRFS_DC_WRITTEN;
-               last = cache->key.objectid + cache->key.offset;
+       /*
+        * We don't need the lock here since we are protected by the transaction
+        * commit.  We want to do the cache_save_setup first and then run the
+        * delayed refs to make sure we have the best chance at doing this all
+        * in one shot.
+        */
+       while (!list_empty(&cur_trans->dirty_bgs)) {
+               cache = list_first_entry(&cur_trans->dirty_bgs,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+               list_del_init(&cache->dirty_list);
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+               if (!ret)
+                       ret = btrfs_run_delayed_refs(trans, root,
+                                                    (unsigned long) -1);
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+                       btrfs_write_out_cache(root, trans, cache, path);
+               if (!ret)
+                       ret = write_one_cache_group(trans, root, path, cache);
                btrfs_put_block_group(cache);
        }
-out:
 
        btrfs_free_path(path);
-       return err;
+       return ret;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 /**
  * drop_outstanding_extent - drop an outstanding extent
  * @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're relaseing.
  *
  * This is called when we are freeing up an outstanding extent, either called
  * after an error or after an extent is written.  This will return the number of
  * reserved extents that need to be freed.  This must be called with
  * BTRFS_I(inode)->lock held.
  */
-static unsigned drop_outstanding_extent(struct inode *inode)
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 {
        unsigned drop_inode_space = 0;
        unsigned dropped_extents = 0;
+       unsigned num_extents = 0;
 
-       BUG_ON(!BTRFS_I(inode)->outstanding_extents);
-       BTRFS_I(inode)->outstanding_extents--;
+       num_extents = (unsigned)div64_u64(num_bytes +
+                                         BTRFS_MAX_EXTENT_SIZE - 1,
+                                         BTRFS_MAX_EXTENT_SIZE);
+       ASSERT(num_extents);
+       ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+       BTRFS_I(inode)->outstanding_extents -= num_extents;
 
        if (BTRFS_I(inode)->outstanding_extents == 0 &&
            test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 out_fail:
        spin_lock(&BTRFS_I(inode)->lock);
-       dropped = drop_outstanding_extent(inode);
+       dropped = drop_outstanding_extent(inode, num_bytes);
        /*
         * If the inodes csum_bytes is the same as the original
         * csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
        num_bytes = ALIGN(num_bytes, root->sectorsize);
        spin_lock(&BTRFS_I(inode)->lock);
-       dropped = drop_outstanding_extent(inode);
+       dropped = drop_outstanding_extent(inode, num_bytes);
 
        if (num_bytes)
                to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
        btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc)
 {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
                        cache_block_group(cache, 1);
 
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
                    cache->disk_cache_state < BTRFS_DC_CLEAR)
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
 
-               cache->dirty = 1;
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
                if (alloc) {
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                unpin = &fs_info->freed_extents[0];
 
        while (1) {
+               mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
-               if (ret)
+               if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        break;
+               }
 
                if (btrfs_test_opt(root, DISCARD))
                        ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
                unpin_extent_range(root, start, end, true);
+               mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
        }
 
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
-               ret = update_block_group(root, bytenr, num_bytes, 0);
+               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
                        goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct extent_buffer *buf,
                           u64 parent, int last_ref)
 {
-       struct btrfs_block_group_cache *cache = NULL;
        int pin = 1;
        int ret;
 
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        if (!last_ref)
                return;
 
-       cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-
        if (btrfs_header_generation(buf) == trans->transid) {
+               struct btrfs_block_group_cache *cache;
+
                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                        ret = check_ref_cleanup(trans, root, buf->start);
                        if (!ret)
                                goto out;
                }
 
+               cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                        pin_down_extent(root, cache, buf->start, buf->len, 1);
+                       btrfs_put_block_group(cache);
                        goto out;
                }
 
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
                btrfs_add_free_space(cache, buf->start, buf->len);
                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+               btrfs_put_block_group(cache);
                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                pin = 0;
        }
@@ -6253,7 +6194,6 @@ out:
         * anymore.
         */
        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
-       btrfs_put_block_group(cache);
 }
 
 /* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       ret = update_block_group(root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                        return ret;
        }
 
-       ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+       ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+                                1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
 static struct extent_buffer *
 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                     u64 bytenr, u32 blocksize, int level)
+                     u64 bytenr, int level)
 {
        struct extent_buffer *buf;
 
-       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       buf = btrfs_find_create_tree_block(root, bytenr);
        if (!buf)
                return ERR_PTR(-ENOMEM);
        btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 
        if (btrfs_test_is_dummy_root(root)) {
                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
-                                           blocksize, level);
+                                           level);
                if (!IS_ERR(buf))
                        root->alloc_bytenr += blocksize;
                return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                return ERR_PTR(ret);
        }
 
-       buf = btrfs_init_new_buffer(trans, root, ins.objectid,
-                                   blocksize, level);
+       buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
        BUG_ON(IS_ERR(buf)); /* -ENOMEM */
 
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                                continue;
                }
 reada:
-               readahead_tree_block(root, bytenr, blocksize);
+               readahead_tree_block(root, bytenr);
                nread++;
        }
        wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 
        next = btrfs_find_tree_block(root, bytenr);
        if (!next) {
-               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+               next = btrfs_find_create_tree_block(root, bytenr);
                if (!next)
                        return -ENOMEM;
                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       alloc_flags = update_block_group_flags(root, cache->flags);
-       if (alloc_flags != cache->flags) {
-               ret = do_chunk_alloc(trans, root, alloc_flags,
-                                    CHUNK_ALLOC_FORCE);
-               if (ret < 0)
-                       goto out;
-       }
-
        ret = set_block_group_ro(cache, 0);
        if (!ret)
                goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                goto out;
        ret = set_block_group_ro(cache, 0);
 out:
+       if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+               alloc_flags = update_block_group_flags(root, cache->flags);
+               check_system_chunk(trans, root, alloc_flags);
+       }
+
        btrfs_end_transaction(trans, root);
        return ret;
 }
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
+       INIT_LIST_HEAD(&cache->dirty_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
 
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                         * b) Setting 'dirty flag' makes sure that we flush
                         *    the new space cache info onto disk.
                         */
-                       cache->disk_cache_state = BTRFS_DC_CLEAR;
                        if (btrfs_test_opt(root, SPACE_CACHE))
-                               cache->dirty = 1;
+                               cache->disk_cache_state = BTRFS_DC_CLEAR;
                }
 
                read_extent_buffer(leaf, &cache->item,
@@ -9460,6 +9397,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                }
        }
 
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
-               trans = btrfs_join_transaction(root);
+               /* 1 for btrfs_orphan_reserve_metadata() */
+               trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_set_block_group_rw(root, block_group);
                        ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                start = block_group->key.objectid;
                end = start + block_group->key.offset - 1;
+               /*
+                * Hold the unused_bg_unpin_mutex lock to avoid racing with
+                * btrfs_finish_extent_commit(). If we are at transaction N,
+                * another task might be running finish_extent_commit() for the
+                * previous transaction N - 1, and have seen a range belonging
+                * to the block group in freed_extents[] before we were able to
+                * clear the whole block group range from freed_extents[]. This
+                * means that task can lookup for the block group after we
+                * unpinned it from freed_extents[] and removed it, leading to
+                * a BUG_ON() at btrfs_unpin_extent_range().
+                */
+               mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_set_block_group_rw(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
+                       mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_set_block_group_rw(root, block_group);
                        goto end_trans;
                }
+               mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
                block_group->pinned = 0;
index c73df6a..c7233ff 100644 (file)
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
 
        while (!list_empty(&states)) {
                state = list_entry(states.next, struct extent_state, leak_list);
-               pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+               pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                       state->start, state->end, state->state,
                       extent_state_in_tree(state),
                       atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
 }
 
 static void set_state_cb(struct extent_io_tree *tree,
-                        struct extent_state *state, unsigned long *bits)
+                        struct extent_state *state, unsigned *bits)
 {
        if (tree->ops && tree->ops->set_bit_hook)
                tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned long *bits)
+                          struct extent_state *state, unsigned *bits)
 {
        if (tree->ops && tree->ops->clear_bit_hook)
                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned long *bits);
+                          struct extent_state *state, unsigned *bits);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
-                       unsigned long *bits)
+                       unsigned *bits)
 {
        struct rb_node *node;
 
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
-                                           unsigned long *bits, int wake)
+                                           unsigned *bits, int wake)
 {
        struct extent_state *next;
-       unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+       unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, int wake, int delete,
+                    unsigned bits, int wake, int delete,
                     struct extent_state **cached_state,
                     gfp_t mask)
 {
@@ -789,9 +789,9 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
-                          unsigned long *bits)
+                          unsigned *bits)
 {
-       unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
+       unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 
        set_state_cb(tree, state, bits);
        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
 
 static void cache_state_if_flags(struct extent_state *state,
                                 struct extent_state **cached_ptr,
-                                const u64 flags)
+                                unsigned flags)
 {
        if (cached_ptr && !(*cached_ptr)) {
                if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
 
 static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                unsigned long bits, unsigned long exclusive_bits,
+                unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
                 gfp_t mask)
 {
@@ -1034,7 +1034,7 @@ search_again:
 }
 
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, u64 * failed_start,
+                  unsigned bits, u64 * failed_start,
                   struct extent_state **cached_state, gfp_t mask)
 {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      unsigned long bits, unsigned long clear_bits,
+                      unsigned bits, unsigned clear_bits,
                       struct extent_state **cached_state, gfp_t mask)
 {
        struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                   unsigned long bits, gfp_t mask)
+                   unsigned bits, gfp_t mask)
 {
        return set_extent_bit(tree, start, end, bits, NULL,
                              NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                     unsigned long bits, gfp_t mask)
+                     unsigned bits, gfp_t mask)
 {
        return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, struct extent_state **cached_state)
+                    unsigned bits, struct extent_state **cached_state)
 {
        int err;
        u64 failed_start;
+
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
  */
 static struct extent_state *
 find_first_extent_bit_state(struct extent_io_tree *tree,
-                           u64 start, unsigned long bits)
+                           u64 start, unsigned bits)
 {
        struct rb_node *node;
        struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, unsigned long bits,
+                         u64 *start_ret, u64 *end_ret, unsigned bits,
                          struct extent_state **cached_state)
 {
        struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
 
 int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
                                 struct page *locked_page,
-                                unsigned long clear_bits,
+                                unsigned clear_bits,
                                 unsigned long page_ops)
 {
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
  */
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
-                    unsigned long bits, int contig)
+                    unsigned bits, int contig)
 {
        struct rb_node *node;
        struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, int filled, struct extent_state *cached)
+                  unsigned bits, int filled, struct extent_state *cached)
 {
        struct extent_state *state = NULL;
        struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
        sector = bbio->stripes[mirror_num-1].physical >> 9;
        bio->bi_iter.bi_sector = sector;
        dev = bbio->stripes[mirror_num-1].dev;
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
                bio_put(bio);
                return -EIO;
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
                                             prev_bio_flags);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               *bio_ret = NULL;
                                return ret;
+                       }
                        bio = NULL;
                } else {
                        return 0;
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
                                               page,
                                               &delalloc_start,
                                               &delalloc_end,
-                                              128 * 1024 * 1024);
+                                              BTRFS_MAX_EXTENT_SIZE);
                if (nr_delalloc == 0) {
                        delalloc_start = delalloc_end + 1;
                        continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 
 static struct extent_buffer *
 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
-                     unsigned long len, gfp_t mask)
+                     unsigned long len)
 {
        struct extent_buffer *eb = NULL;
 
-       eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
        if (eb == NULL)
                return NULL;
        eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
        struct extent_buffer *new;
        unsigned long num_pages = num_extent_pages(src->start, src->len);
 
-       new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+       new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
        if (new == NULL)
                return NULL;
 
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
        return new;
 }
 
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                               u64 start)
 {
        struct extent_buffer *eb;
-       unsigned long num_pages = num_extent_pages(0, len);
+       unsigned long len;
+       unsigned long num_pages;
        unsigned long i;
 
-       eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+       if (!fs_info) {
+               /*
+                * Called only from tests that don't always have a fs_info
+                * available, but we know that nodesize is 4096
+                */
+               len = 4096;
+       } else {
+               len = fs_info->tree_root->nodesize;
+       }
+       num_pages = num_extent_pages(0, len);
+
+       eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
                return NULL;
 
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-                                              u64 start, unsigned long len)
+                                              u64 start)
 {
        struct extent_buffer *eb, *exists = NULL;
        int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;
-       eb = alloc_dummy_extent_buffer(start, len);
+       eb = alloc_dummy_extent_buffer(fs_info, start);
        if (!eb)
                return NULL;
        eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
 #endif
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
-                                         u64 start, unsigned long len)
+                                         u64 start)
 {
+       unsigned long len = fs_info->tree_root->nodesize;
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
        unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
        if (eb)
                return eb;
 
-       eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
+       eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
                return NULL;
 
index ece9ce8..695b0cc 100644 (file)
@@ -4,22 +4,22 @@
 #include <linux/rbtree.h>
 
 /* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_BOUNDARY (1 << 9)
-#define EXTENT_NODATASUM (1 << 10)
-#define EXTENT_DO_ACCOUNTING (1 << 11)
-#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_NEED_WAIT (1 << 13)
-#define EXTENT_DAMAGED (1 << 14)
-#define EXTENT_NORESERVE (1 << 15)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_DIRTY           (1U << 0)
+#define EXTENT_WRITEBACK       (1U << 1)
+#define EXTENT_UPTODATE                (1U << 2)
+#define EXTENT_LOCKED          (1U << 3)
+#define EXTENT_NEW             (1U << 4)
+#define EXTENT_DELALLOC                (1U << 5)
+#define EXTENT_DEFRAG          (1U << 6)
+#define EXTENT_BOUNDARY                (1U << 9)
+#define EXTENT_NODATASUM       (1U << 10)
+#define EXTENT_DO_ACCOUNTING   (1U << 11)
+#define EXTENT_FIRST_DELALLOC  (1U << 12)
+#define EXTENT_NEED_WAIT       (1U << 13)
+#define EXTENT_DAMAGED         (1U << 14)
+#define EXTENT_NORESERVE       (1U << 15)
+#define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
 /*
  * flags for bio submission. The high bits indicate the compression
@@ -81,9 +81,9 @@ struct extent_io_ops {
        int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
                                      struct extent_state *state, int uptodate);
        void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-                            unsigned long *bits);
+                            unsigned *bits);
        void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-                              unsigned long *bits);
+                              unsigned *bits);
        void (*merge_extent_hook)(struct inode *inode,
                                  struct extent_state *new,
                                  struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
        /* ADD NEW ELEMENTS AFTER THIS */
        wait_queue_head_t wq;
        atomic_t refs;
-       unsigned long state;
+       unsigned state;
 
        /* for use by the FS */
        u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 int try_release_extent_buffer(struct page *page);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, struct extent_state **cached);
+                    unsigned bits, struct extent_state **cached);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
                         struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
 
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end,
-                    u64 max_bytes, unsigned long bits, int contig);
+                    u64 max_bytes, unsigned bits, int contig);
 
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, int filled,
+                  unsigned bits, int filled,
                   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                     unsigned long bits, gfp_t mask);
+                     unsigned bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned long bits, int wake, int delete,
+                    unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                   unsigned long bits, gfp_t mask);
+                   unsigned bits, gfp_t mask);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  unsigned long bits, u64 *failed_start,
+                  unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      unsigned long bits, unsigned long clear_bits,
+                      unsigned bits, unsigned clear_bits,
                       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
                      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, unsigned long bits,
+                         u64 *start_ret, u64 *end_ret, unsigned bits,
                          struct extent_state **cached_state);
 int extent_invalidatepage(struct extent_io_tree *tree,
                          struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
-                                         u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+                                         u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+               u64 start);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
                                 struct page *locked_page,
-                                unsigned long bits_to_clear,
+                                unsigned bits_to_clear,
                                 unsigned long page_ops);
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
                                      u64 *end, u64 max_bytes);
 #endif
 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-                                              u64 start, unsigned long len);
+                                              u64 start);
 #endif
index d6c03f7..a719785 100644 (file)
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        struct io_ctl io_ctl;
        struct btrfs_key key;
        struct btrfs_free_space *e, *n;
-       struct list_head bitmaps;
+       LIST_HEAD(bitmaps);
        u64 num_entries;
        u64 num_bitmaps;
        u64 generation;
        u8 type;
        int ret = 0;
 
-       INIT_LIST_HEAD(&bitmaps);
-
        /* Nothing in the space cache, goodbye */
        if (!i_size_read(inode))
                return 0;
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct inode *inode;
        int ret = 0;
+       enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
 
        root = root->fs_info->tree_root;
 
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
                                      path, block_group->key.objectid);
        if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->disk_cache_state = BTRFS_DC_ERROR;
-               spin_unlock(&block_group->lock);
+               dcs = BTRFS_DC_ERROR;
                ret = 0;
 #ifdef DEBUG
                btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 #endif
        }
 
+       spin_lock(&block_group->lock);
+       block_group->disk_cache_state = dcs;
+       spin_unlock(&block_group->lock);
        iput(inode);
        return ret;
 }
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
                                 min_bytes);
 
-       INIT_LIST_HEAD(&bitmaps);
        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
                                      bytes + empty_size,
                                      cont1_bytes, min_bytes);
index 8ffa478..265e03c 100644 (file)
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->leave_spinning = 1;
+       path->skip_release_on_error = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      ins_len);
        if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                ptr = (unsigned long)(ref + 1);
                ret = 0;
        } else if (ret < 0) {
-               if (ret == -EOVERFLOW)
-                       ret = -EMLINK;
+               if (ret == -EOVERFLOW) {
+                       if (find_name_in_backref(path, name, name_len, &ref))
+                               ret = -EEXIST;
+                       else
+                               ret = -EMLINK;
+               }
                goto out;
        } else {
                ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
index 54bcf63..a85c23d 100644 (file)
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static void btrfs_split_extent_hook(struct inode *inode,
                                    struct extent_state *orig, u64 split)
 {
+       u64 size;
+
        /* not delalloc, ignore it */
        if (!(orig->state & EXTENT_DELALLOC))
                return;
 
+       size = orig->end - orig->start + 1;
+       if (size > BTRFS_MAX_EXTENT_SIZE) {
+               u64 num_extents;
+               u64 new_size;
+
+               /*
+                * We need the largest size of the remaining extent to see if we
+                * need to add a new outstanding extent.  Think of the following
+                * case
+                *
+                * [MEAX_EXTENT_SIZEx2 - 4k][4k]
+                *
+                * The new_size would just be 4k and we'd think we had enough
+                * outstanding extents for this if we only took one side of the
+                * split, same goes for the other direction.  We need to see if
+                * the larger size still is the same amount of extents as the
+                * original size, because if it is we need to add a new
+                * outstanding extent.  But if we split up and the larger size
+                * is less than the original then we are good to go since we've
+                * already accounted for the extra extent in our original
+                * accounting.
+                */
+               new_size = orig->end - split + 1;
+               if ((split - orig->start) > new_size)
+                       new_size = split - orig->start;
+
+               num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                       BTRFS_MAX_EXTENT_SIZE);
+               if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                             BTRFS_MAX_EXTENT_SIZE) < num_extents)
+                       return;
+       }
+
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents++;
        spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
                                    struct extent_state *new,
                                    struct extent_state *other)
 {
+       u64 new_size, old_size;
+       u64 num_extents;
+
        /* not delalloc, ignore it */
        if (!(other->state & EXTENT_DELALLOC))
                return;
 
+       old_size = other->end - other->start + 1;
+       new_size = old_size + (new->end - new->start + 1);
+
+       /* we're not bigger than the max, unreserve the space and go */
+       if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->outstanding_extents--;
+               spin_unlock(&BTRFS_I(inode)->lock);
+               return;
+       }
+
+       /*
+        * If we grew by another max_extent, just return, we want to keep that
+        * reserved amount.
+        */
+       num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                               BTRFS_MAX_EXTENT_SIZE);
+       if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                     BTRFS_MAX_EXTENT_SIZE) > num_extents)
+               return;
+
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents--;
        spin_unlock(&BTRFS_I(inode)->lock);
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
  * have pending delalloc work to be done.
  */
 static void btrfs_set_bit_hook(struct inode *inode,
-                              struct extent_state *state, unsigned long *bits)
+                              struct extent_state *state, unsigned *bits)
 {
 
        if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
  */
 static void btrfs_clear_bit_hook(struct inode *inode,
                                 struct extent_state *state,
-                                unsigned long *bits)
+                                unsigned *bits)
 {
        u64 len = state->end + 1 - state->start;
+       u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+                                   BTRFS_MAX_EXTENT_SIZE);
 
        spin_lock(&BTRFS_I(inode)->lock);
        if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                        *bits &= ~EXTENT_FIRST_DELALLOC;
                } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                        spin_lock(&BTRFS_I(inode)->lock);
-                       BTRFS_I(inode)->outstanding_extents--;
+                       BTRFS_I(inode)->outstanding_extents -= num_extents;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
        return 0;
 zeroit:
        if (__ratelimit(&_rs))
-               btrfs_info(BTRFS_I(inode)->root->fs_info,
+               btrfs_warn(BTRFS_I(inode)->root->fs_info,
                           "csum failed ino %llu off %llu csum %u expected csum %u",
                           btrfs_ino(inode), start, csum, csum_expected);
        memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 out:
        if (ret)
-               btrfs_crit(root->fs_info,
+               btrfs_err(root->fs_info,
                        "could not do orphan cleanup %d", ret);
        btrfs_free_path(path);
        return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
-       struct btrfs_timespec *tspec;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
 
-       tspec = btrfs_inode_atime(inode_item);
-       inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+       inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+       inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+       inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
 
-       tspec = btrfs_inode_mtime(inode_item);
-       inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+       inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
 
-       tspec = btrfs_inode_ctime(inode_item);
-       inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
-       inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+       BTRFS_I(inode)->i_otime.tv_sec =
+               btrfs_timespec_sec(leaf, &inode_item->otime);
+       BTRFS_I(inode)->i_otime.tv_nsec =
+               btrfs_timespec_nsec(leaf, &inode_item->otime);
 
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3656,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->atime,
                                     inode->i_atime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                      inode->i_atime.tv_nsec, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                     inode->i_mtime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                      inode->i_mtime.tv_nsec, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                     inode->i_ctime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                      inode->i_ctime.tv_nsec, &token);
 
+       btrfs_set_token_timespec_sec(leaf, &item->otime,
+                                    BTRFS_I(inode)->i_otime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, &item->otime,
+                                     BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
                                     &token);
        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5007,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
        struct btrfs_root *new_root;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
+       struct btrfs_key key;
        int ret;
        int err = 0;
 
@@ -5017,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
        }
 
        err = -ENOENT;
-       ret = btrfs_find_item(root->fs_info->tree_root, path,
-                               BTRFS_I(dir)->root->root_key.objectid,
-                               location->objectid, BTRFS_ROOT_REF_KEY, NULL);
+       key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = location->objectid;
+
+       ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+                               0, 0);
        if (ret) {
                if (ret < 0)
                        err = ret;
@@ -5258,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
        inode->i_op = &btrfs_dir_ro_inode_operations;
        inode->i_fop = &simple_dir_operations;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+       inode->i_mtime = CURRENT_TIME;
+       inode->i_atime = inode->i_mtime;
+       inode->i_ctime = inode->i_mtime;
+       BTRFS_I(inode)->i_otime = inode->i_mtime;
 
        return inode;
 }
@@ -5826,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
        inode_init_owner(inode, dir, mode);
        inode_set_bytes(inode, 0);
-       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+       inode->i_mtime = CURRENT_TIME;
+       inode->i_atime = inode->i_mtime;
+       inode->i_ctime = inode->i_mtime;
+       BTRFS_I(inode)->i_otime = inode->i_mtime;
+
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                  struct btrfs_inode_item);
        memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -7134,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
+       u64 orig_len = len;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
 
        if (create)
-               unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+               unlock_bits |= EXTENT_DIRTY;
        else
                len = min_t(u64, len, root->sectorsize);
 
@@ -7269,14 +7349,12 @@ unlock:
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
 
-               spin_lock(&BTRFS_I(inode)->lock);
-               BTRFS_I(inode)->outstanding_extents++;
-               spin_unlock(&BTRFS_I(inode)->lock);
-
-               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockstart + len - 1, EXTENT_DELALLOC, NULL,
-                                    &cached_state, GFP_NOFS);
-               BUG_ON(ret);
+               if (len < orig_len) {
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       BTRFS_I(inode)->outstanding_extents++;
+                       spin_unlock(&BTRFS_I(inode)->lock);
+               }
+               btrfs_free_reserved_data_space(inode, len);
        }
 
        /*
@@ -7805,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        }
 
        /* async crcs make it difficult to collect full stripe writes. */
-       if (btrfs_get_alloc_profile(root, 1) &
-           (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+       if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
                async_submit = 0;
        else
                async_submit = 1;
@@ -8053,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode,
                                                     count - (size_t)ret);
-               else
-                       btrfs_delalloc_release_metadata(inode, 0);
        }
 out:
        if (wakeup)
@@ -8575,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
        ei->delayed_node = NULL;
 
+       ei->i_otime.tv_sec = 0;
+       ei->i_otime.tv_nsec = 0;
+
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
index 48b60db..97159a8 100644 (file)
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                qgroup = u64_to_ptr(unode->aux);
                qgroup->rfer += sign * oper->num_bytes;
                qgroup->rfer_cmpr += sign * oper->num_bytes;
+               WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
                qgroup->excl += sign * oper->num_bytes;
-               if (sign < 0)
-                       WARN_ON(qgroup->excl < oper->num_bytes);
                qgroup->excl_cmpr += sign * oper->num_bytes;
                qgroup_dirty(fs_info, qgroup);
 
index 8ab2a17..5264858 100644 (file)
  */
 #define RBIO_CACHE_READY_BIT   3
 
-/*
- * bbio and raid_map is managed by the caller, so we shouldn't free
- * them here. And besides that, all rbios with this flag should not
- * be cached, because we need raid_map to check the rbios' stripe
- * is the same or not, but it is very likely that the caller has
- * free raid_map, so don't cache those rbios.
- */
-#define RBIO_HOLD_BBIO_MAP_BIT 4
-
 #define RBIO_CACHE_SIZE 1024
 
 enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
        struct btrfs_fs_info *fs_info;
        struct btrfs_bio *bbio;
 
-       /*
-        * logical block numbers for the start of each stripe
-        * The last one or two are p/q.  These are sorted,
-        * so raid_map[0] is the start of our full stripe
-        */
-       u64 *raid_map;
-
        /* while we're doing rmw on a stripe
         * we put it into a hash table so we can
         * lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 {
-       u64 num = rbio->raid_map[0];
+       u64 num = rbio->bbio->raid_map[0];
 
        /*
         * we shift down quite a bit.  We're using byte
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
            test_bit(RBIO_CACHE_BIT, &cur->flags))
                return 0;
 
-       if (last->raid_map[0] !=
-           cur->raid_map[0])
+       if (last->bbio->raid_map[0] !=
+           cur->bbio->raid_map[0])
                return 0;
 
        /* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
        spin_lock_irqsave(&h->lock, flags);
        list_for_each_entry(cur, &h->hash_list, hash_list) {
                walk++;
-               if (cur->raid_map[0] == rbio->raid_map[0]) {
+               if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
                        spin_lock(&cur->bio_list_lock);
 
                        /* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
                remove_rbio_from_cache(rbio);
 }
 
-static inline void
-__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
-{
-       if (need) {
-               kfree(raid_map);
-               kfree(bbio);
-       }
-}
-
-static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
-{
-       __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
-                       !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
-}
-
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
        int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                }
        }
 
-       free_bbio_and_raid_map(rbio);
-
+       btrfs_put_bbio(rbio->bbio);
        kfree(rbio);
 }
 
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
  * this does not allocate any pages for rbio->pages.
  */
 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len)
+                         struct btrfs_bio *bbio, u64 stripe_len)
 {
        struct btrfs_raid_bio *rbio;
        int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
        INIT_LIST_HEAD(&rbio->stripe_cache);
        INIT_LIST_HEAD(&rbio->hash_list);
        rbio->bbio = bbio;
-       rbio->raid_map = raid_map;
        rbio->fs_info = root->fs_info;
        rbio->stripe_len = stripe_len;
        rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
        rbio->bio_pages = p + sizeof(struct page *) * num_pages;
        rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
 
-       if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+               nr_data = real_stripes - 1;
+       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
                nr_data = real_stripes - 2;
        else
-               nr_data = real_stripes - 1;
+               BUG();
 
        rbio->nr_data = nr_data;
        return rbio;
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
        spin_lock_irq(&rbio->bio_list_lock);
        bio_list_for_each(bio, &rbio->bio_list) {
                start = (u64)bio->bi_iter.bi_sector << 9;
-               stripe_offset = start - rbio->raid_map[0];
+               stripe_offset = start - rbio->bbio->raid_map[0];
                page_index = stripe_offset >> PAGE_CACHE_SHIFT;
 
                for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
        logical <<= 9;
 
        for (i = 0; i < rbio->nr_data; i++) {
-               stripe_start = rbio->raid_map[i];
+               stripe_start = rbio->bbio->raid_map[i];
                if (logical >= stripe_start &&
                    logical < stripe_start + rbio->stripe_len) {
                        return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
  * our main entry point for writes from the rest of the FS.
  */
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-                       struct btrfs_bio *bbio, u64 *raid_map,
-                       u64 stripe_len)
+                       struct btrfs_bio *bbio, u64 stripe_len)
 {
        struct btrfs_raid_bio *rbio;
        struct btrfs_plug_cb *plug = NULL;
        struct blk_plug_cb *cb;
        int ret;
 
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
        if (IS_ERR(rbio)) {
-               __free_bbio_and_raid_map(bbio, raid_map, 1);
+               btrfs_put_bbio(bbio);
                return PTR_ERR(rbio);
        }
        bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                }
 
                /* all raid6 handling here */
-               if (rbio->raid_map[rbio->real_stripes - 1] ==
-                   RAID6_Q_STRIPE) {
-
+               if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
                        /*
                         * single failure, rebuild from parity raid5
                         * style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                         * here due to a crc mismatch and we can't give them the
                         * data they want
                         */
-                       if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
-                               if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+                       if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+                               if (rbio->bbio->raid_map[faila] ==
+                                   RAID5_P_STRIPE) {
                                        err = -EIO;
                                        goto cleanup;
                                }
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                                goto pstripe;
                        }
 
-                       if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+                       if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
                                raid6_datap_recov(rbio->real_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
@@ -2001,8 +1967,7 @@ cleanup:
 
 cleanup_io:
        if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
-               if (err == 0 &&
-                   !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
+               if (err == 0)
                        cache_rbio_pages(rbio);
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
  * of the drive.
  */
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num, int generic_io)
+                         struct btrfs_bio *bbio, u64 stripe_len,
+                         int mirror_num, int generic_io)
 {
        struct btrfs_raid_bio *rbio;
        int ret;
 
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
        if (IS_ERR(rbio)) {
-               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+               if (generic_io)
+                       btrfs_put_bbio(bbio);
                return PTR_ERR(rbio);
        }
 
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
-               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+               if (generic_io)
+                       btrfs_put_bbio(bbio);
                kfree(rbio);
                return -EIO;
        }
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                btrfs_bio_counter_inc_noblocked(root->fs_info);
                rbio->generic_bio_cnt = 1;
        } else {
-               set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+               btrfs_get_bbio(bbio);
        }
 
        /*
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
 
 struct btrfs_raid_bio *
 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              struct btrfs_bio *bbio, u64 stripe_len,
+                              struct btrfs_device *scrub_dev,
                               unsigned long *dbitmap, int stripe_nsectors)
 {
        struct btrfs_raid_bio *rbio;
        int i;
 
-       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       rbio = alloc_rbio(root, bbio, stripe_len);
        if (IS_ERR(rbio))
                return NULL;
        bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
        int stripe_offset;
        int index;
 
-       ASSERT(logical >= rbio->raid_map[0]);
-       ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+       ASSERT(logical >= rbio->bbio->raid_map[0]);
+       ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
                                rbio->stripe_len * rbio->nr_data);
-       stripe_offset = (int)(logical - rbio->raid_map[0]);
+       stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
        index = stripe_offset >> PAGE_CACHE_SHIFT;
        rbio->bio_pages[index] = page;
 }
index 31d4a15..2b5d797 100644 (file)
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
 struct btrfs_device;
 
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num, int generic_io);
+                         struct btrfs_bio *bbio, u64 stripe_len,
+                         int mirror_num, int generic_io);
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len);
+                              struct btrfs_bio *bbio, u64 stripe_len);
 
 struct btrfs_raid_bio *
 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 *raid_map,
-                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              struct btrfs_bio *bbio, u64 stripe_len,
+                              struct btrfs_device *scrub_dev,
                               unsigned long *dbitmap, int stripe_nsectors);
 void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
                                   struct page *page, u64 logical);
index b63ae20..0e7beea 100644 (file)
@@ -66,7 +66,6 @@ struct reada_extctl {
 struct reada_extent {
        u64                     logical;
        struct btrfs_key        top;
-       u32                     blocksize;
        int                     err;
        struct list_head        extctl;
        int                     refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
        blocksize = root->nodesize;
        re->logical = logical;
-       re->blocksize = blocksize;
        re->top = *top;
        INIT_LIST_HEAD(&re->extctl);
        spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        spin_unlock(&fs_info->reada_lock);
        btrfs_dev_replace_unlock(&fs_info->dev_replace);
 
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
        return re;
 
 error:
@@ -488,7 +486,7 @@ error:
                kref_put(&zone->refcnt, reada_zone_release);
                spin_unlock(&fs_info->reada_lock);
        }
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
        kfree(re);
        return re_exist;
 }
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
        int mirror_num = 0;
        struct extent_buffer *eb = NULL;
        u64 logical;
-       u32 blocksize;
        int ret;
        int i;
        int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                spin_unlock(&fs_info->reada_lock);
                return 0;
        }
-       dev->reada_next = re->logical + re->blocksize;
+       dev->reada_next = re->logical + fs_info->tree_root->nodesize;
        re->refcnt++;
 
        spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                }
        }
        logical = re->logical;
-       blocksize = re->blocksize;
 
        spin_lock(&re->lock);
        if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
                return 0;
 
        atomic_inc(&dev->reada_in_flight);
-       ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
-                        mirror_num, &eb);
+       ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+                       mirror_num, &eb);
        if (ret)
                __readahead_hook(fs_info->extent_root, NULL, logical, ret);
        else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                                break;
                        printk(KERN_DEBUG
                                "  re: logical %llu size %u empty %d for %lld",
-                               re->logical, re->blocksize,
+                               re->logical, fs_info->tree_root->nodesize,
                                list_empty(&re->extctl), re->scheduled_for ?
                                re->scheduled_for->devid : -1);
 
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                }
                printk(KERN_DEBUG
                        "re: logical %llu size %u list empty %d for %lld",
-                       re->logical, re->blocksize, list_empty(&re->extctl),
+                       re->logical, fs_info->tree_root->nodesize,
+                       list_empty(&re->extctl),
                        re->scheduled_for ? re->scheduled_for->devid : -1);
                for (i = 0; i < re->nzones; ++i) {
                        printk(KERN_CONT " zone %llu-%llu devs",
index 74257d6..d830853 100644 (file)
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
        }
 }
 
-static int tree_block_processed(u64 bytenr, u32 blocksize,
-                               struct reloc_control *rc)
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
 {
+       u32 blocksize = rc->extent_root->nodesize;
+
        if (test_range_bit(&rc->processed_blocks, bytenr,
                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
                return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
        while (rb_node) {
                block = rb_entry(rb_node, struct tree_block, rb_node);
                if (!block->key_ready)
-                       readahead_tree_block(rc->extent_root, block->bytenr,
-                                       block->key.objectid);
+                       readahead_tree_block(rc->extent_root, block->bytenr);
                rb_node = rb_next(rb_node);
        }
 
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
        bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
                                        SKINNY_METADATA);
 
-       if (tree_block_processed(bytenr, blocksize, rc))
+       if (tree_block_processed(bytenr, rc))
                return 0;
 
        if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
                if (added)
                        goto next;
 
-               if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+               if (!tree_block_processed(leaf->start, rc)) {
                        block = kmalloc(sizeof(*block), GFP_NOFS);
                        if (!block) {
                                err = -ENOMEM;
index e427cb7..ec57687 100644 (file)
@@ -66,7 +66,6 @@ struct scrub_ctx;
 struct scrub_recover {
        atomic_t                refs;
        struct btrfs_bio        *bbio;
-       u64                     *raid_map;
        u64                     map_length;
 };
 
@@ -80,7 +79,7 @@ struct scrub_page {
        u64                     logical;
        u64                     physical;
        u64                     physical_for_dev_replace;
-       atomic_t                ref_count;
+       atomic_t                refs;
        struct {
                unsigned int    mirror_num:8;
                unsigned int    have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
        int                     page_count;
        atomic_t                outstanding_pages;
-       atomic_t                ref_count; /* free mem on transition to zero */
+       atomic_t                refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
        struct scrub_parity     *sparity;
        struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
 
        int                     stripe_len;
 
-       atomic_t                ref_count;
+       atomic_t                refs;
 
        struct list_head        spages;
 
@@ -194,6 +193,15 @@ struct scrub_ctx {
         */
        struct btrfs_scrub_progress stat;
        spinlock_t              stat_lock;
+
+       /*
+        * Use a ref counter to avoid use-after-free issues. Scrub workers
+        * decrement bios_in_flight and workers_pending and then do a wakeup
+        * on the list_wait wait queue. We must ensure the main scrub task
+        * doesn't free the scrub context before or while the workers are
+        * doing the wakeup() call.
+        */
+       atomic_t                refs;
 };
 
 struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
-                                    struct btrfs_fs_info *fs_info,
-                                    struct scrub_block *original_sblock,
-                                    u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                     struct scrub_block *sblocks_for_recheck);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                                         const u8 *csum, u64 generation,
                                         u16 csum_size);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-                                            struct scrub_block *sblock_good,
-                                            int force_write);
+                                            struct scrub_block *sblock_good);
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                                            struct scrub_block *sblock_good,
                                            int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 static void copy_nocow_pages_worker(struct btrfs_work *work);
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
+       atomic_inc(&sctx->refs);
        atomic_inc(&sctx->bios_in_flight);
 }
 
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 {
        atomic_dec(&sctx->bios_in_flight);
        wake_up(&sctx->list_wait);
+       scrub_put_ctx(sctx);
 }
 
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
+       atomic_inc(&sctx->refs);
        /*
         * increment scrubs_running to prevent cancel requests from
         * completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
        atomic_dec(&sctx->workers_pending);
        wake_up(&fs_info->scrub_pause_wait);
        wake_up(&sctx->list_wait);
+       scrub_put_ctx(sctx);
 }
 
 static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
        kfree(sctx);
 }
 
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+       if (atomic_dec_and_test(&sctx->refs))
+               scrub_free_ctx(sctx);
+}
+
 static noinline_for_stack
 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
        sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
        if (!sctx)
                goto nomem;
+       atomic_set(&sctx->refs, 1);
        sctx->is_dev_replace = is_dev_replace;
        sctx->pages_per_rd_bio = pages_per_rd_bio;
        sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_root *local_root;
        struct btrfs_key root_key;
+       struct btrfs_key key;
 
        root_key.objectid = root;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
                goto err;
        }
 
-       ret = inode_item_info(inum, 0, local_root, swarn->path);
+       /*
+        * this makes the path point to (inum INODE_ITEM ioff)
+        */
+       key.objectid = inum;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
        if (ret) {
                btrfs_release_path(swarn->path);
                goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
 static inline void scrub_put_recover(struct scrub_recover *recover)
 {
        if (atomic_dec_and_test(&recover->refs)) {
-               kfree(recover->bbio);
-               kfree(recover->raid_map);
+               btrfs_put_bbio(recover->bbio);
                kfree(recover);
        }
 }
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        }
 
        /* setup the context, map the logical blocks and alloc the pages */
-       ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
-                                       logical, sblocks_for_recheck);
+       ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
        if (ret) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        if (!is_metadata && !have_csum) {
                struct scrub_fixup_nodatasum *fixup_nodatasum;
 
-nodatasum_case:
                WARN_ON(sctx->is_dev_replace);
 
+nodatasum_case:
+
                /*
                 * !is_metadata and !have_csum, this means that the data
                 * might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
                    sblock_other->no_io_error_seen) {
                        if (sctx->is_dev_replace) {
                                scrub_write_block_to_dev_replace(sblock_other);
+                               goto corrected_error;
                        } else {
-                               int force_write = is_metadata || have_csum;
-
                                ret = scrub_repair_block_from_good_copy(
-                                               sblock_bad, sblock_other,
-                                               force_write);
+                                               sblock_bad, sblock_other);
+                               if (!ret)
+                                       goto corrected_error;
                        }
-                       if (0 == ret)
-                               goto corrected_error;
                }
        }
 
-       /*
-        * for dev_replace, pick good pages and write to the target device.
-        */
-       if (sctx->is_dev_replace) {
-               success = 1;
-               for (page_num = 0; page_num < sblock_bad->page_count;
-                    page_num++) {
-                       int sub_success;
-
-                       sub_success = 0;
-                       for (mirror_index = 0;
-                            mirror_index < BTRFS_MAX_MIRRORS &&
-                            sblocks_for_recheck[mirror_index].page_count > 0;
-                            mirror_index++) {
-                               struct scrub_block *sblock_other =
-                                       sblocks_for_recheck + mirror_index;
-                               struct scrub_page *page_other =
-                                       sblock_other->pagev[page_num];
-
-                               if (!page_other->io_error) {
-                                       ret = scrub_write_page_to_dev_replace(
-                                                       sblock_other, page_num);
-                                       if (ret == 0) {
-                                               /* succeeded for this page */
-                                               sub_success = 1;
-                                               break;
-                                       } else {
-                                               btrfs_dev_replace_stats_inc(
-                                                       &sctx->dev_root->
-                                                       fs_info->dev_replace.
-                                                       num_write_errors);
-                                       }
-                               }
-                       }
-
-                       if (!sub_success) {
-                               /*
-                                * did not find a mirror to fetch the page
-                                * from. scrub_write_page_to_dev_replace()
-                                * handles this case (page->io_error), by
-                                * filling the block with zeros before
-                                * submitting the write request
-                                */
-                               success = 0;
-                               ret = scrub_write_page_to_dev_replace(
-                                               sblock_bad, page_num);
-                               if (ret)
-                                       btrfs_dev_replace_stats_inc(
-                                               &sctx->dev_root->fs_info->
-                                               dev_replace.num_write_errors);
-                       }
-               }
-
-               goto out;
-       }
+       if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+               goto did_not_correct_error;
 
        /*
-        * for regular scrub, repair those pages that are errored.
         * In case of I/O errors in the area that is supposed to be
         * repaired, continue by picking good copies of those pages.
         * Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
         * mirror, even if other 512 byte sectors in the same PAGE_SIZE
         * area are unreadable.
         */
-
-       /* can only fix I/O errors from here on */
-       if (sblock_bad->no_io_error_seen)
-               goto did_not_correct_error;
-
        success = 1;
-       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+       for (page_num = 0; page_num < sblock_bad->page_count;
+            page_num++) {
                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+               struct scrub_block *sblock_other = NULL;
 
-               if (!page_bad->io_error)
+               /* skip no-io-error page in scrub */
+               if (!page_bad->io_error && !sctx->is_dev_replace)
                        continue;
 
-               for (mirror_index = 0;
-                    mirror_index < BTRFS_MAX_MIRRORS &&
-                    sblocks_for_recheck[mirror_index].page_count > 0;
-                    mirror_index++) {
-                       struct scrub_block *sblock_other = sblocks_for_recheck +
-                                                          mirror_index;
-                       struct scrub_page *page_other = sblock_other->pagev[
-                                                       page_num];
-
-                       if (!page_other->io_error) {
-                               ret = scrub_repair_page_from_good_copy(
-                                       sblock_bad, sblock_other, page_num, 0);
-                               if (0 == ret) {
-                                       page_bad->io_error = 0;
-                                       break; /* succeeded for this page */
+               /* try to find no-io-error page in mirrors */
+               if (page_bad->io_error) {
+                       for (mirror_index = 0;
+                            mirror_index < BTRFS_MAX_MIRRORS &&
+                            sblocks_for_recheck[mirror_index].page_count > 0;
+                            mirror_index++) {
+                               if (!sblocks_for_recheck[mirror_index].
+                                   pagev[page_num]->io_error) {
+                                       sblock_other = sblocks_for_recheck +
+                                                      mirror_index;
+                                       break;
                                }
                        }
+                       if (!sblock_other)
+                               success = 0;
                }
 
-               if (page_bad->io_error) {
-                       /* did not find a mirror to copy the page from */
-                       success = 0;
+               if (sctx->is_dev_replace) {
+                       /*
+                        * did not find a mirror to fetch the page
+                        * from. scrub_write_page_to_dev_replace()
+                        * handles this case (page->io_error), by
+                        * filling the block with zeros before
+                        * submitting the write request
+                        */
+                       if (!sblock_other)
+                               sblock_other = sblock_bad;
+
+                       if (scrub_write_page_to_dev_replace(sblock_other,
+                                                           page_num) != 0) {
+                               btrfs_dev_replace_stats_inc(
+                                       &sctx->dev_root->
+                                       fs_info->dev_replace.
+                                       num_write_errors);
+                               success = 0;
+                       }
+               } else if (sblock_other) {
+                       ret = scrub_repair_page_from_good_copy(sblock_bad,
+                                                              sblock_other,
+                                                              page_num, 0);
+                       if (0 == ret)
+                               page_bad->io_error = 0;
+                       else
+                               success = 0;
                }
        }
 
-       if (success) {
+       if (success && !sctx->is_dev_replace) {
                if (is_metadata || have_csum) {
                        /*
                         * need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
        return 0;
 }
 
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
 {
-       if (raid_map) {
-               if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
-                       return 3;
-               else
-                       return 2;
-       } else {
+       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+               return 2;
+       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+               return 3;
+       else
                return (int)bbio->num_stripes;
-       }
 }
 
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+                                                u64 *raid_map,
                                                 u64 mapped_length,
                                                 int nstripes, int mirror,
                                                 int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
 {
        int i;
 
-       if (raid_map) {
+       if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                /* RAID5/6 */
                for (i = 0; i < nstripes; i++) {
                        if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
        }
 }
 
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
-                                    struct btrfs_fs_info *fs_info,
-                                    struct scrub_block *original_sblock,
-                                    u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                     struct scrub_block *sblocks_for_recheck)
 {
+       struct scrub_ctx *sctx = original_sblock->sctx;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+       u64 length = original_sblock->page_count * PAGE_SIZE;
+       u64 logical = original_sblock->pagev[0]->logical;
        struct scrub_recover *recover;
        struct btrfs_bio *bbio;
-       u64 *raid_map;
        u64 sublen;
        u64 mapped_length;
        u64 stripe_offset;
        int stripe_index;
-       int page_index;
+       int page_index = 0;
        int mirror_index;
        int nmirrors;
        int ret;
 
        /*
-        * note: the two members ref_count and outstanding_pages
+        * note: the two members refs and outstanding_pages
         * are not used (and not set) in the blocks that are used for
         * the recheck procedure
         */
 
-       page_index = 0;
        while (length > 0) {
                sublen = min_t(u64, length, PAGE_SIZE);
                mapped_length = sublen;
                bbio = NULL;
-               raid_map = NULL;
 
                /*
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
                ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
-                                      &mapped_length, &bbio, 0, &raid_map);
+                                      &mapped_length, &bbio, 0, 1);
                if (ret || !bbio || mapped_length < sublen) {
-                       kfree(bbio);
-                       kfree(raid_map);
+                       btrfs_put_bbio(bbio);
                        return -EIO;
                }
 
                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
                if (!recover) {
-                       kfree(bbio);
-                       kfree(raid_map);
+                       btrfs_put_bbio(bbio);
                        return -ENOMEM;
                }
 
                atomic_set(&recover->refs, 1);
                recover->bbio = bbio;
-               recover->raid_map = raid_map;
                recover->map_length = mapped_length;
 
                BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 
-               nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+               nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
                for (mirror_index = 0; mirror_index < nmirrors;
                     mirror_index++) {
                        struct scrub_block *sblock;
                        struct scrub_page *page;
 
-                       if (mirror_index >= BTRFS_MAX_MIRRORS)
-                               continue;
-
                        sblock = sblocks_for_recheck + mirror_index;
                        sblock->sctx = sctx;
                        page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
                        sblock->pagev[page_index] = page;
                        page->logical = logical;
 
-                       scrub_stripe_index_and_offset(logical, raid_map,
+                       scrub_stripe_index_and_offset(logical,
+                                                     bbio->map_type,
+                                                     bbio->raid_map,
                                                      mapped_length,
-                                                     bbio->num_stripes,
+                                                     bbio->num_stripes -
+                                                     bbio->num_tgtdevs,
                                                      mirror_index,
                                                      &stripe_index,
                                                      &stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
 
 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
 {
-       return page->recover && page->recover->raid_map;
+       return page->recover &&
+              (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 }
 
 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
        bio->bi_end_io = scrub_bio_wait_endio;
 
        ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
-                                   page->recover->raid_map,
                                    page->recover->map_length,
                                    page->mirror_num, 0);
        if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 }
 
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-                                            struct scrub_block *sblock_good,
-                                            int force_write)
+                                            struct scrub_block *sblock_good)
 {
        int page_num;
        int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 
                ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
                                                           sblock_good,
-                                                          page_num,
-                                                          force_write);
+                                                          page_num, 1);
                if (ret_sub)
                        ret = ret_sub;
        }
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
 static void scrub_block_get(struct scrub_block *sblock)
 {
-       atomic_inc(&sblock->ref_count);
+       atomic_inc(&sblock->refs);
 }
 
 static void scrub_block_put(struct scrub_block *sblock)
 {
-       if (atomic_dec_and_test(&sblock->ref_count)) {
+       if (atomic_dec_and_test(&sblock->refs)) {
                int i;
 
                if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
 
 static void scrub_page_get(struct scrub_page *spage)
 {
-       atomic_inc(&spage->ref_count);
+       atomic_inc(&spage->refs);
 }
 
 static void scrub_page_put(struct scrub_page *spage)
 {
-       if (atomic_dec_and_test(&spage->ref_count)) {
+       if (atomic_dec_and_test(&spage->refs)) {
                if (spage->page)
                        __free_page(spage->page);
                kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->ref_count, 1);
+       atomic_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
 
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->ref_count, 1);
+       atomic_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
        sblock->sparity = sparity;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        struct btrfs_raid_bio *rbio;
        struct scrub_page *spage;
        struct btrfs_bio *bbio = NULL;
-       u64 *raid_map = NULL;
        u64 length;
        int ret;
 
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        length = sparity->logic_end - sparity->logic_start + 1;
        ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
                               sparity->logic_start,
-                              &length, &bbio, 0, &raid_map);
-       if (ret || !bbio || !raid_map)
+                              &length, &bbio, 0, 1);
+       if (ret || !bbio || !bbio->raid_map)
                goto bbio_out;
 
        bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        bio->bi_end_io = scrub_parity_bio_endio;
 
        rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
-                                             raid_map, length,
-                                             sparity->scrub_dev,
+                                             length, sparity->scrub_dev,
                                              sparity->dbitmap,
                                              sparity->nsectors);
        if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 rbio_out:
        bio_put(bio);
 bbio_out:
-       kfree(bbio);
-       kfree(raid_map);
+       btrfs_put_bbio(bbio);
        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
                  sparity->nsectors);
        spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
 
 static void scrub_parity_get(struct scrub_parity *sparity)
 {
-       atomic_inc(&sparity->ref_count);
+       atomic_inc(&sparity->refs);
 }
 
 static void scrub_parity_put(struct scrub_parity *sparity)
 {
-       if (!atomic_dec_and_test(&sparity->ref_count))
+       if (!atomic_dec_and_test(&sparity->refs))
                return;
 
        scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        sparity->scrub_dev = sdev;
        sparity->logic_start = logic_start;
        sparity->logic_end = logic_end;
-       atomic_set(&sparity->ref_count, 1);
+       atomic_set(&sparity->refs, 1);
        INIT_LIST_HEAD(&sparity->spages);
        sparity->dbitmap = sparity->bitmap;
        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
                increment = map->stripe_len;
                mirror_num = num % map->num_stripes + 1;
-       } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                get_raid56_logic_offset(physical, num, map, &offset, NULL);
                increment = map->stripe_len * nr_data_stripes(map);
                mirror_num = 1;
@@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         */
        logical = base + offset;
        physical_end = physical + nstripes * map->stripe_len;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                get_raid56_logic_offset(physical_end, num,
                                        map, &logic_end, NULL);
                logic_end += base;
@@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        ret = 0;
        while (physical < physical_end) {
                /* for raid56, we skip parity stripe */
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
+               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        ret = get_raid56_logic_offset(physical, num,
                                        map, &logical, &stripe_logical);
                        logical += base;
@@ -3280,8 +3254,7 @@ again:
                        scrub_free_csums(sctx);
                        if (extent_logical + extent_len <
                            key.objectid + bytes) {
-                               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                                       BTRFS_BLOCK_GROUP_RAID6)) {
+                               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                                        /*
                                         * loop until we find next data stripe
                                         * or we have finished all stripes.
@@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        scrub_workers_put(fs_info);
        mutex_unlock(&fs_info->scrub_lock);
 
-       scrub_free_ctx(sctx);
+       scrub_put_ctx(sctx);
 
        return ret;
 }
@@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                              &mapped_length, &bbio, 0);
        if (ret || !bbio || mapped_length < extent_len ||
            !bbio->stripes[0].dev->bdev) {
-               kfree(bbio);
+               btrfs_put_bbio(bbio);
                return;
        }
 
        *extent_physical = bbio->stripes[0].physical;
        *extent_mirror_num = bbio->mirror_num;
        *extent_dev = bbio->stripes[0].dev;
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
 }
 
 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
index 804432d..fe58572 100644 (file)
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
        if (ret < 0)
                goto out;
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
-                       btrfs_inode_atime(ii));
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
-                       btrfs_inode_mtime(ii));
-       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
-                       btrfs_inode_ctime(ii));
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+       TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
        /* TODO Add otime support when the otime patches get into upstream */
 
        ret = send_cmd(sctx);
index 6f49b28..05fef19 100644 (file)
@@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
        return btrfs_commit_transaction(trans, root);
 }
 
-static int btrfs_unfreeze(struct super_block *sb)
-{
-       return 0;
-}
-
 static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
        .statfs         = btrfs_statfs,
        .remount_fs     = btrfs_remount,
        .freeze_fs      = btrfs_freeze,
-       .unfreeze_fs    = btrfs_unfreeze,
 };
 
 static const struct file_operations btrfs_ctl_fops = {
index 92db3f6..94edb0a 100644 (file)
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
 
        ret = btrfs_init_debugfs();
        if (ret)
-               return ret;
+               goto out1;
 
        init_feature_attrs();
        ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+       if (ret)
+               goto out2;
+
+       return 0;
+out2:
+       debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+       kset_unregister(btrfs_kset);
 
        return ret;
 }
index cc286ce..f51963a 100644 (file)
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
                return -ENOMEM;
        }
 
-       path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+       path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
        if (!eb) {
                test_msg("Could not allocate dummy buffer\n");
                ret = -ENOMEM;
index 7e99c2f..9e9f236 100644 (file)
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
        }
        ret = 0;
 out_bits:
-       clear_extent_bits(&tmp, 0, total_dirty - 1,
-                         (unsigned long)-1, GFP_NOFS);
+       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
 out:
        if (locked_page)
                page_cache_release(locked_page);
index 3ae0f5b..a116b55 100644 (file)
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
                goto out;
        }
 
-       root->node = alloc_dummy_extent_buffer(0, 4096);
+       root->node = alloc_dummy_extent_buffer(NULL, 4096);
        if (!root->node) {
                test_msg("Couldn't allocate dummy buffer\n");
                goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
                goto out;
        }
 
-       root->node = alloc_dummy_extent_buffer(0, 4096);
+       root->node = alloc_dummy_extent_buffer(NULL, 4096);
        if (!root->node) {
                test_msg("Couldn't allocate dummy buffer\n");
                goto out;
index ec3dcb2..73f299e 100644 (file)
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
                ret = -ENOMEM;
                goto out;
        }
+       /* We are using this root as our extent root */
+       root->fs_info->extent_root = root;
+
+       /*
+        * Some of the paths we test assume we have a filled out fs_info, so we
+        * just need to add the root in there so we don't panic.
+        */
+       root->fs_info->tree_root = root;
+       root->fs_info->quota_root = root;
+       root->fs_info->quota_enabled = 1;
 
        /*
         * Can't use bytenr 0, some things freak out
         * *cough*backref walking code*cough*
         */
-       root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+       root->node = alloc_test_extent_buffer(root->fs_info, 4096);
        if (!root->node) {
                test_msg("Couldn't allocate dummy buffer\n");
                ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
                goto out;
        }
 
-       /* We are using this root as our extent root */
-       root->fs_info->extent_root = root;
-
-       /*
-        * Some of the paths we test assume we have a filled out fs_info, so we
-        * just need to addt he root in there so we don't panic.
-        */
-       root->fs_info->tree_root = root;
-       root->fs_info->quota_root = root;
-       root->fs_info->quota_enabled = 1;
-
        test_msg("Running qgroup tests\n");
        ret = test_no_shared_qgroup(root);
        if (ret)
index e88b59d..7e80f32 100644 (file)
@@ -220,6 +220,7 @@ loop:
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
+       cur_trans->have_free_bgs = 0;
        cur_trans->start_time = get_seconds();
 
        cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
        INIT_LIST_HEAD(&cur_trans->pending_ordered);
+       INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+       spin_lock_init(&cur_trans->dirty_bgs_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        u64 old_root_bytenr;
        u64 old_root_used;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
+       bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
 
        old_root_used = btrfs_root_used(&root->root_item);
        btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start &&
-                   old_root_used == btrfs_root_used(&root->root_item))
+                   old_root_used == btrfs_root_used(&root->root_item) &&
+                   (!extent_root ||
+                    list_empty(&trans->transaction->dirty_bgs)))
                        break;
 
                btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                        return ret;
 
                old_root_used = btrfs_root_used(&root->root_item);
-               ret = btrfs_write_dirty_block_groups(trans, root);
+               if (extent_root) {
+                       ret = btrfs_write_dirty_block_groups(trans, root);
+                       if (ret)
+                               return ret;
+               }
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                if (ret)
                        return ret;
        }
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        struct extent_buffer *eb;
        int ret;
 
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret)
-               return ret;
-
        eb = btrfs_lock_root_node(fs_info->tree_root);
        ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                              0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);
+               clear_bit(BTRFS_ROOT_DIRTY, &root->state);
 
                if (root != fs_info->extent_root)
                        list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        switch_commit_roots(cur_trans, root->fs_info);
 
        assert_qgroups_uptodate(trans);
+       ASSERT(list_empty(&cur_trans->dirty_bgs));
        update_super_roots(root);
 
        btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
+       if (cur_trans->have_free_bgs)
+               btrfs_clear_space_info_full(root->fs_info);
+
        root->fs_info->last_trans_committed = cur_trans->transid;
        /*
         * We needn't acquire the lock here because there is no other task
index 00ed29c..937050a 100644 (file)
@@ -47,6 +47,11 @@ struct btrfs_transaction {
        atomic_t num_writers;
        atomic_t use_count;
 
+       /*
+        * true if there is free bgs operations in this transaction
+        */
+       int have_free_bgs;
+
        /* Be protected by fs_info->trans_lock when we want to change it. */
        enum btrfs_trans_state state;
        struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
        struct list_head pending_chunks;
        struct list_head pending_ordered;
        struct list_head switch_commits;
+       struct list_head dirty_bgs;
+       spinlock_t dirty_bgs_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
 };
index 1a9585d..9a37f8b 100644 (file)
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 insert:
        btrfs_release_path(path);
        /* try to insert the key into the destination tree */
+       path->skip_release_on_error = 1;
        ret = btrfs_insert_empty_item(trans, root, path,
                                      key, item_size);
+       path->skip_release_on_error = 0;
 
        /* make sure any existing item is the correct size */
-       if (ret == -EEXIST) {
+       if (ret == -EEXIST || ret == -EOVERFLOW) {
                u32 found_size;
                found_size = btrfs_item_size_nr(path->nodes[0],
                                                path->slots[0]);
@@ -488,8 +490,20 @@ insert:
                src_item = (struct btrfs_inode_item *)src_ptr;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
 
-               if (btrfs_inode_generation(eb, src_item) == 0)
+               if (btrfs_inode_generation(eb, src_item) == 0) {
+                       struct extent_buffer *dst_eb = path->nodes[0];
+
+                       if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+                               struct btrfs_map_token token;
+                               u64 ino_size = btrfs_inode_size(eb, src_item);
+
+                               btrfs_init_map_token(&token);
+                               btrfs_set_token_inode_size(dst_eb, dst_item,
+                                                          ino_size, &token);
+                       }
                        goto no_copy;
+               }
 
                if (overwrite_root &&
                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
 static noinline int backref_in_log(struct btrfs_root *log,
                                   struct btrfs_key *key,
                                   u64 ref_objectid,
-                                  char *name, int namelen)
+                                  const char *name, int namelen)
 {
        struct btrfs_path *path;
        struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
 }
 
 static int insert_orphan_item(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 offset)
+                             struct btrfs_root *root, u64 ino)
 {
        int ret;
-       ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
-                       offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
-       if (ret > 0)
-               ret = btrfs_insert_orphan_item(trans, root, offset);
+
+       ret = btrfs_insert_orphan_item(trans, root, ino);
+       if (ret == -EEXIST)
+               ret = 0;
+
        return ret;
 }
 
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               cur_offset = 0;
 
                while (cur_offset < item_size) {
                        extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
        }
        btrfs_release_path(path);
 
-       if (ret < 0)
+       if (ret < 0 && ret != -ENOENT)
                return ret;
        return nlink;
 }
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
        nlink = ret;
 
        ret = count_inode_extrefs(root, inode, path);
-       if (ret == -ENOENT)
-               ret = 0;
-
        if (ret < 0)
                goto out;
 
@@ -1556,6 +1569,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+                           const char *name, const int name_len,
+                           const u64 dirid, const u64 ino)
+{
+       struct btrfs_key search_key;
+
+       search_key.objectid = ino;
+       search_key.type = BTRFS_INODE_REF_KEY;
+       search_key.offset = dirid;
+       if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+               return true;
+
+       search_key.type = BTRFS_INODE_EXTREF_KEY;
+       search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+       if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+               return true;
+
+       return false;
+}
+
 /*
  * take a single entry in a log directory item and replay it into
  * the subvolume.
@@ -1666,10 +1703,17 @@ out:
        return ret;
 
 insert:
+       if (name_in_log_ref(root->log_root, name, name_len,
+                           key->objectid, log_key.objectid)) {
+               /* The dentry will be added later. */
+               ret = 0;
+               update_size = false;
+               goto out;
+       }
        btrfs_release_path(path);
        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
                              name, name_len, log_type, &log_key);
-       if (ret && ret != -ENOENT)
+       if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
        update_size = false;
        ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                parent = path->nodes[*level];
                root_owner = btrfs_header_owner(parent);
 
-               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+               next = btrfs_find_create_tree_block(root, bytenr);
                if (!next)
                        return -ENOMEM;
 
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
                mutex_unlock(&root->log_mutex);
                if (atomic_read(&root->log_writers))
                        schedule();
-               mutex_lock(&root->log_mutex);
                finish_wait(&root->log_writer_wait, &wait);
+               mutex_lock(&root->log_mutex);
        }
 }
 
@@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct extent_buffer *leaf,
                            struct btrfs_inode_item *item,
-                           struct inode *inode, int log_inode_only)
+                           struct inode *inode, int log_inode_only,
+                           u64 logged_isize)
 {
        struct btrfs_map_token token;
 
@@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                 * to say 'update this inode with these values'
                 */
                btrfs_set_token_inode_generation(leaf, item, 0, &token);
-               btrfs_set_token_inode_size(leaf, item, 0, &token);
+               btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
        } else {
                btrfs_set_token_inode_generation(leaf, item,
                                                 BTRFS_I(inode)->generation,
@@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->atime,
                                     inode->i_atime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                      inode->i_atime.tv_nsec, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                     inode->i_mtime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                      inode->i_mtime.tv_nsec, &token);
 
-       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                     inode->i_ctime.tv_sec, &token);
-       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+       btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                      inode->i_ctime.tv_nsec, &token);
 
        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
                return ret;
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_inode_item);
-       fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+       fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
        btrfs_release_path(path);
        return 0;
 }
@@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                               struct inode *inode,
                               struct btrfs_path *dst_path,
                               struct btrfs_path *src_path, u64 *last_extent,
-                              int start_slot, int nr, int inode_only)
+                              int start_slot, int nr, int inode_only,
+                              u64 logged_isize)
 {
        unsigned long src_offset;
        unsigned long dst_offset;
@@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                                    dst_path->slots[0],
                                                    struct btrfs_inode_item);
                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
-                                       inode, inode_only == LOG_INODE_EXISTS);
+                                       inode, inode_only == LOG_INODE_EXISTS,
+                                       logged_isize);
                } else {
                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
                                           src_offset, ins_sizes[i]);
@@ -3902,6 +3949,33 @@ process:
        return ret;
 }
 
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+                            struct btrfs_path *path, u64 *size_ret)
+{
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+       if (ret < 0) {
+               return ret;
+       } else if (ret > 0) {
+               *size_ret = i_size_read(inode);
+       } else {
+               struct btrfs_inode_item *item;
+
+               item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_inode_item);
+               *size_ret = btrfs_inode_size(path->nodes[0], item);
+       }
+
+       btrfs_release_path(path);
+       return 0;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        bool fast_search = false;
        u64 ino = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       u64 logged_isize = 0;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
 
-       /* Only run delayed items if we are a dir or a new file */
+       /*
+        * Only run delayed items if we are a dir or a new file.
+        * Otherwise commit the delayed inode only, which is needed in
+        * order for the log replay code to mark inodes for link count
+        * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+        */
        if (S_ISDIR(inode->i_mode) ||
-           BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+           BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
                ret = btrfs_commit_inode_delayed_items(trans, inode);
-               if (ret) {
-                       btrfs_free_path(path);
-                       btrfs_free_path(dst_path);
-                       return ret;
-               }
+       else
+               ret = btrfs_commit_inode_delayed_inode(inode);
+
+       if (ret) {
+               btrfs_free_path(path);
+               btrfs_free_path(dst_path);
+               return ret;
        }
 
        mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        if (S_ISDIR(inode->i_mode)) {
                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
 
-               if (inode_only == LOG_INODE_EXISTS)
-                       max_key_type = BTRFS_XATTR_ITEM_KEY;
+               if (inode_only == LOG_INODE_EXISTS) {
+                       max_key_type = BTRFS_INODE_EXTREF_KEY;
+                       max_key.type = max_key_type;
+               }
                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
        } else {
-               if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-                                      &BTRFS_I(inode)->runtime_flags)) {
-                       clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                 &BTRFS_I(inode)->runtime_flags);
-                       ret = btrfs_truncate_inode_items(trans, log,
-                                                        inode, 0, 0);
-               } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                             &BTRFS_I(inode)->runtime_flags) ||
+               if (inode_only == LOG_INODE_EXISTS) {
+                       /*
+                        * Make sure the new inode item we write to the log has
+                        * the same isize as the current one (if it exists).
+                        * This is necessary to prevent data loss after log
+                        * replay, and also to prevent doing a wrong expanding
+                        * truncate - for e.g. create file, write 4K into offset
+                        * 0, fsync, write 4K into offset 4096, add hard link,
+                        * fsync some other file (to sync log), power fail - if
+                        * we use the inode's current i_size, after log replay
+                        * we get a 8Kb file, with the last 4Kb extent as a hole
+                        * (zeroes), as if an expanding truncate happened,
+                        * instead of getting a file of 4Kb only.
+                        */
+                       err = logged_inode_size(log, inode, path,
+                                               &logged_isize);
+                       if (err)
+                               goto out_unlock;
+               }
+               if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                            &BTRFS_I(inode)->runtime_flags)) {
+                       if (inode_only == LOG_INODE_EXISTS) {
+                               max_key.type = BTRFS_INODE_EXTREF_KEY;
+                               ret = drop_objectid_items(trans, log, path, ino,
+                                                         max_key.type);
+                       } else {
+                               clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                                         &BTRFS_I(inode)->runtime_flags);
+                               clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                         &BTRFS_I(inode)->runtime_flags);
+                               ret = btrfs_truncate_inode_items(trans, log,
+                                                                inode, 0, 0);
+                       }
+               } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                   &BTRFS_I(inode)->runtime_flags) ||
                           inode_only == LOG_INODE_EXISTS) {
-                       if (inode_only == LOG_INODE_ALL)
+                       if (inode_only == LOG_INODE_ALL) {
+                               clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                         &BTRFS_I(inode)->runtime_flags);
                                fast_search = true;
-                       max_key.type = BTRFS_XATTR_ITEM_KEY;
+                               max_key.type = BTRFS_XATTR_ITEM_KEY;
+                       } else {
+                               max_key.type = BTRFS_INODE_EXTREF_KEY;
+                       }
                        ret = drop_objectid_items(trans, log, path, ino,
                                                  max_key.type);
                } else {
@@ -4047,7 +4163,8 @@ again:
                }
 
                ret = copy_items(trans, inode, dst_path, path, &last_extent,
-                                ins_start_slot, ins_nr, inode_only);
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
                if (ret < 0) {
                        err = ret;
                        goto out_unlock;
@@ -4071,7 +4188,7 @@ next_slot:
                if (ins_nr) {
                        ret = copy_items(trans, inode, dst_path, path,
                                         &last_extent, ins_start_slot,
-                                        ins_nr, inode_only);
+                                        ins_nr, inode_only, logged_isize);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
@@ -4092,7 +4209,8 @@ next_slot:
        }
        if (ins_nr) {
                ret = copy_items(trans, inode, dst_path, path, &last_extent,
-                                ins_start_slot, ins_nr, inode_only);
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
                if (ret < 0) {
                        err = ret;
                        goto out_unlock;
@@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
+       const struct dentry * const first_parent = parent;
+       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+                                last_committed);
 
        sb = inode->i_sb;
 
@@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_trans;
        }
 
-       inode_only = LOG_INODE_EXISTS;
        while (1) {
                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                        break;
@@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (root != BTRFS_I(inode)->root)
                        break;
 
+               /*
+                * On unlink we must make sure our immediate parent directory
+                * inode is fully logged. This is to prevent leaving dangling
+                * directory index entries and a wrong directory inode's i_size.
+                * Not doing so can result in a directory being impossible to
+                * delete after log replay (rmdir will always fail with error
+                * -ENOTEMPTY).
+                */
+               if (did_unlink && parent == first_parent)
+                       inode_only = LOG_INODE_ALL;
+               else
+                       inode_only = LOG_INODE_EXISTS;
+
                if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed) {
+                   root->fs_info->last_trans_committed ||
+                   inode_only == LOG_INODE_ALL) {
                        ret = btrfs_log_inode(trans, root, inode, inode_only,
                                              0, LLONG_MAX, ctx);
                        if (ret)
index 50c5a87..cd4d131 100644 (file)
@@ -1310,6 +1310,8 @@ again:
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
+       } else {
+               trans->transaction->have_free_bgs = 1;
        }
 out:
        btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
 
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 {
-       if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+       if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
                return;
 
        btrfs_set_fs_incompat(info, RAID56);
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
 
        BUG_ON(em->start > logical || em->start + em->len < logical);
        map = (struct map_lookup *)em->bdev;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                len = map->stripe_len * nr_data_stripes(map);
-       }
        free_extent_map(em);
        return len;
 }
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
 
        BUG_ON(em->start > logical || em->start + em->len < logical);
        map = (struct map_lookup *)em->bdev;
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                        BTRFS_BLOCK_GROUP_RAID6))
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                ret = 1;
        free_extent_map(em);
        return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
 }
 
 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
 {
        struct btrfs_bio_stripe s;
-       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
        int i;
        u64 l;
        int again = 1;
-       int m;
 
        while (again) {
                again = 0;
-               for (i = 0; i < real_stripes - 1; i++) {
-                       if (parity_smaller(raid_map[i], raid_map[i+1])) {
+               for (i = 0; i < num_stripes - 1; i++) {
+                       if (parity_smaller(bbio->raid_map[i],
+                                          bbio->raid_map[i+1])) {
                                s = bbio->stripes[i];
-                               l = raid_map[i];
+                               l = bbio->raid_map[i];
                                bbio->stripes[i] = bbio->stripes[i+1];
-                               raid_map[i] = raid_map[i+1];
+                               bbio->raid_map[i] = bbio->raid_map[i+1];
                                bbio->stripes[i+1] = s;
-                               raid_map[i+1] = l;
-
-                               if (bbio->tgtdev_map) {
-                                       m = bbio->tgtdev_map[i];
-                                       bbio->tgtdev_map[i] =
-                                                       bbio->tgtdev_map[i + 1];
-                                       bbio->tgtdev_map[i + 1] = m;
-                               }
+                               bbio->raid_map[i+1] = l;
 
                                again = 1;
                        }
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
        }
 }
 
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+       struct btrfs_bio *bbio = kzalloc(
+               sizeof(struct btrfs_bio) +
+               sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+               sizeof(int) * (real_stripes) +
+               sizeof(u64) * (real_stripes),
+               GFP_NOFS);
+       if (!bbio)
+               return NULL;
+
+       atomic_set(&bbio->error, 0);
+       atomic_set(&bbio->refs, 1);
+
+       return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+       WARN_ON(!atomic_read(&bbio->refs));
+       atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+       if (!bbio)
+               return;
+       if (atomic_dec_and_test(&bbio->refs))
+               kfree(bbio);
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
-                            int mirror_num, u64 **raid_map_ret)
+                            int mirror_num, int need_raid_map)
 {
        struct extent_map *em;
        struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        u64 stripe_nr_orig;
        u64 stripe_nr_end;
        u64 stripe_len;
-       u64 *raid_map = NULL;
        int stripe_index;
        int i;
        int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        stripe_offset = offset - stripe_offset;
 
        /* if we're here for raid56, we need to know the stripe aligned start */
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
                raid56_full_stripe_start = offset;
 
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 
        if (rw & REQ_DISCARD) {
                /* we don't discard raid56 yet */
-               if (map->type &
-                   (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        ret = -EOPNOTSUPP;
                        goto out;
                }
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                /* For writes to RAID[56], allow a full stripeset across all disks.
                   For other RAID types and for RAID[56] reads, just allow a single
                   stripe (on a single disk). */
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+               if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
                    (rw & REQ_WRITE)) {
                        max_len = stripe_len * nr_data_stripes(map) -
                                (offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                u64 physical_of_found = 0;
 
                ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-                            logical, &tmp_length, &tmp_bbio, 0, NULL);
+                            logical, &tmp_length, &tmp_bbio, 0, 0);
                if (ret) {
                        WARN_ON(tmp_bbio != NULL);
                        goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                         * is not left of the left cursor
                         */
                        ret = -EIO;
-                       kfree(tmp_bbio);
+                       btrfs_put_bbio(tmp_bbio);
                        goto out;
                }
 
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                } else {
                        WARN_ON(1);
                        ret = -EIO;
-                       kfree(tmp_bbio);
+                       btrfs_put_bbio(tmp_bbio);
                        goto out;
                }
 
-               kfree(tmp_bbio);
+               btrfs_put_bbio(tmp_bbio);
        } else if (mirror_num > map->num_stripes) {
                mirror_num = 0;
        }
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        mirror_num = stripe_index - old_stripe_index + 1;
                }
 
-       } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                               BTRFS_BLOCK_GROUP_RAID6)) {
-               u64 tmp;
-
-               if (raid_map_ret &&
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               if (need_raid_map &&
                    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
                     mirror_num > 1)) {
-                       int i, rot;
-
                        /* push stripe_nr back to the start of the full stripe */
                        stripe_nr = raid56_full_stripe_start;
                        do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        num_stripes = map->num_stripes;
                        max_errors = nr_parity_stripes(map);
 
-                       raid_map = kmalloc_array(num_stripes, sizeof(u64),
-                                          GFP_NOFS);
-                       if (!raid_map) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-
-                       /* Work out the disk rotation on this stripe-set */
-                       tmp = stripe_nr;
-                       rot = do_div(tmp, num_stripes);
-
-                       /* Fill in the logical address of each stripe */
-                       tmp = stripe_nr * nr_data_stripes(map);
-                       for (i = 0; i < nr_data_stripes(map); i++)
-                               raid_map[(i+rot) % num_stripes] =
-                                       em->start + (tmp + i) * map->stripe_len;
-
-                       raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
-                       if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-                               raid_map[(i+rot+1) % num_stripes] =
-                                       RAID6_Q_STRIPE;
-
                        *length = map->stripe_len;
                        stripe_index = 0;
                        stripe_offset = 0;
                } else {
+                       u64 tmp;
+
                        /*
                         * Mirror #0 or #1 means the original data block.
                         * Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                tgtdev_indexes = num_stripes;
        }
 
-       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
-                      GFP_NOFS);
+       bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
        if (!bbio) {
-               kfree(raid_map);
                ret = -ENOMEM;
                goto out;
        }
-       atomic_set(&bbio->error, 0);
        if (dev_replace_is_ongoing)
                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
 
+       /* build raid_map */
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+           need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+           mirror_num > 1)) {
+               u64 tmp;
+               int i, rot;
+
+               bbio->raid_map = (u64 *)((void *)bbio->stripes +
+                                sizeof(struct btrfs_bio_stripe) *
+                                num_alloc_stripes +
+                                sizeof(int) * tgtdev_indexes);
+
+               /* Work out the disk rotation on this stripe-set */
+               tmp = stripe_nr;
+               rot = do_div(tmp, num_stripes);
+
+               /* Fill in the logical address of each stripe */
+               tmp = stripe_nr * nr_data_stripes(map);
+               for (i = 0; i < nr_data_stripes(map); i++)
+                       bbio->raid_map[(i+rot) % num_stripes] =
+                               em->start + (tmp + i) * map->stripe_len;
+
+               bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+               if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+                       bbio->raid_map[(i+rot+1) % num_stripes] =
+                               RAID6_Q_STRIPE;
+       }
+
        if (rw & REQ_DISCARD) {
                int factor = 0;
                int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                max_errors = btrfs_chunk_max_errors(map);
 
+       if (bbio->raid_map)
+               sort_parity_stripes(bbio, num_stripes);
+
        tgtdev_indexes = 0;
        if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        }
 
        *bbio_ret = bbio;
+       bbio->map_type = map->type;
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
                bbio->mirror_num = map->num_stripes + 1;
        }
-       if (raid_map) {
-               sort_parity_stripes(bbio, raid_map);
-               *raid_map_ret = raid_map;
-       }
 out:
        if (dev_replace_is_ongoing)
                btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                      struct btrfs_bio **bbio_ret, int mirror_num)
 {
        return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-                                mirror_num, NULL);
+                                mirror_num, 0);
 }
 
 /* For Scrub/replace */
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
                     u64 logical, u64 *length,
                     struct btrfs_bio **bbio_ret, int mirror_num,
-                    u64 **raid_map_ret)
+                    int need_raid_map)
 {
        return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-                                mirror_num, raid_map_ret);
+                                mirror_num, need_raid_map);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                do_div(length, map->num_stripes / map->sub_stripes);
        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                do_div(length, map->num_stripes);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-                             BTRFS_BLOCK_GROUP_RAID6)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                do_div(length, nr_data_stripes(map));
                rmap_len = map->stripe_len * nr_data_stripes(map);
        }
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
                bio_endio_nodec(bio, err);
        else
                bio_endio(bio, err);
-       kfree(bbio);
+       btrfs_put_bbio(bbio);
 }
 
 static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
-       u64 *raid_map = NULL;
        int ret;
        int dev_nr = 0;
        int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
        btrfs_bio_counter_inc_blocked(root->fs_info);
        ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-                             mirror_num, &raid_map);
+                             mirror_num, 1);
        if (ret) {
                btrfs_bio_counter_dec(root->fs_info);
                return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        bbio->fs_info = root->fs_info;
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
-       if (raid_map) {
+       if (bbio->raid_map) {
                /* In this case, map_length has been set to the length of
                   a single stripe; not the whole write */
                if (rw & WRITE) {
-                       ret = raid56_parity_write(root, bio, bbio,
-                                                 raid_map, map_length);
+                       ret = raid56_parity_write(root, bio, bbio, map_length);
                } else {
-                       ret = raid56_parity_recover(root, bio, bbio,
-                                                   raid_map, map_length,
+                       ret = raid56_parity_recover(root, bio, bbio, map_length,
                                                    mirror_num, 1);
                }
 
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
-       u8 *ptr;
-       unsigned long sb_ptr;
+       u8 *array_ptr;
+       unsigned long sb_array_offset;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
-       u32 cur;
+       u32 cur_offset;
        struct btrfs_key key;
 
-       sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
-                                         BTRFS_SUPER_INFO_SIZE);
+       ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+       /*
+        * This will create extent buffer of nodesize, superblock size is
+        * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+        * overallocate but we can keep it as-is, only the first page is used.
+        */
+       sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
        if (!sb)
                return -ENOMEM;
        btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);
 
-       ptr = super_copy->sys_chunk_array;
-       sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
-       cur = 0;
+       array_ptr = super_copy->sys_chunk_array;
+       sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+       cur_offset = 0;
+
+       while (cur_offset < array_size) {
+               disk_key = (struct btrfs_disk_key *)array_ptr;
+               len = sizeof(*disk_key);
+               if (cur_offset + len > array_size)
+                       goto out_short_read;
 
-       while (cur < array_size) {
-               disk_key = (struct btrfs_disk_key *)ptr;
                btrfs_disk_key_to_cpu(&key, disk_key);
 
-               len = sizeof(*disk_key); ptr += len;
-               sb_ptr += len;
-               cur += len;
+               array_ptr += len;
+               sb_array_offset += len;
+               cur_offset += len;
 
                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
-                       chunk = (struct btrfs_chunk *)sb_ptr;
+                       chunk = (struct btrfs_chunk *)sb_array_offset;
+                       /*
+                        * At least one btrfs_chunk with one stripe must be
+                        * present, exact stripe count check comes afterwards
+                        */
+                       len = btrfs_chunk_item_size(1);
+                       if (cur_offset + len > array_size)
+                               goto out_short_read;
+
+                       num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+                       len = btrfs_chunk_item_size(num_stripes);
+                       if (cur_offset + len > array_size)
+                               goto out_short_read;
+
                        ret = read_one_chunk(root, &key, sb, chunk);
                        if (ret)
                                break;
-                       num_stripes = btrfs_chunk_num_stripes(sb, chunk);
-                       len = btrfs_chunk_item_size(num_stripes);
                } else {
                        ret = -EIO;
                        break;
                }
-               ptr += len;
-               sb_ptr += len;
-               cur += len;
+               array_ptr += len;
+               sb_array_offset += len;
+               cur_offset += len;
        }
        free_extent_buffer(sb);
        return ret;
+
+out_short_read:
+       printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+                       len, cur_offset);
+       free_extent_buffer(sb);
+       return -EIO;
 }
 
 int btrfs_read_chunk_tree(struct btrfs_root *root)
index d6fe73c..83069de 100644 (file)
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 #define BTRFS_BIO_ORIG_BIO_SUBMITTED   (1 << 0)
 
 struct btrfs_bio {
+       atomic_t refs;
        atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
+       u64 map_type; /* get from map_lookup->type */
        bio_end_io_t *end_io;
        struct bio *orig_bio;
        unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
        int mirror_num;
        int num_tgtdevs;
        int *tgtdev_map;
+       /*
+        * logical block numbers for the start of each stripe
+        * The last one or two are p/q.  These are sorted,
+        * so raid_map[0] is the start of our full stripe
+        */
+       u64 *raid_map;
        struct btrfs_bio_stripe stripes[];
 };
 
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
 
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
-
-#define btrfs_bio_size(total_stripes, real_stripes)            \
-       (sizeof(struct btrfs_bio) +                             \
-        (sizeof(struct btrfs_bio_stripe) * (total_stripes)) +  \
-        (sizeof(int) * (real_stripes)))
-
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                    u64 logical, u64 *length,
                    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
                     u64 logical, u64 *length,
                     struct btrfs_bio **bbio_ret, int mirror_num,
-                    u64 **raid_map_ret);
+                    int need_raid_map);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len);
index 611e1c5..b6dec05 100644 (file)
@@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args {
 
 /* Error codes as returned by the kernel */
 enum btrfs_err_code {
-       notused,
-       BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
        BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,