Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c

index 80e8472..d309018 100644 (file)
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
  
         ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
         if (!ifp) {
-               kfree(fspath);
+               vfree(fspath);
                 return ERR_PTR(-ENOMEM);
         }
  
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 61205e3..1da5753 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
         struct list_head delayed_iput;
         long delayed_iput_count;
  
+       /*
+        * Avoids races between lockless (i_mutex not held) direct IO writes
+        * and concurrent fsync requests. Direct IO writes must acquire read
+        * access on this semaphore for creating an extent map and its
+        * corresponding ordered extent. The fast fsync path must acquire write
+        * access on this semaphore before it collects ordered extents and
+        * extent maps.
+        */
+       struct rw_semaphore dio_sem;
+
         struct inode vfs_inode;
  };
  
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c

index ff61a41..658c39b 100644 (file)
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -743,8 +743,11 @@ out:
  static struct {
         struct list_head idle_ws;
         spinlock_t ws_lock;
-       int num_ws;
-       atomic_t alloc_ws;
+       /* Number of free workspaces */
+       int free_ws;
+       /* Total number of allocated workspaces */
+       atomic_t total_ws;
+       /* Waiters for a free workspace */
         wait_queue_head_t ws_wait;
  } btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
  
@@ -758,16 +761,34 @@ void __init btrfs_init_compress(void)
         int i;
  
         for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               struct list_head *workspace;
+
                 INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
                 spin_lock_init(&btrfs_comp_ws[i].ws_lock);
-               atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+               atomic_set(&btrfs_comp_ws[i].total_ws, 0);
                 init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+
+               /*
+                * Preallocate one workspace for each compression type so
+                * we can guarantee forward progress in the worst case
+                */
+               workspace = btrfs_compress_op[i]->alloc_workspace();
+               if (IS_ERR(workspace)) {
+                       printk(KERN_WARNING
+       "BTRFS: cannot preallocate compression workspace, will try later");
+               } else {
+                       atomic_set(&btrfs_comp_ws[i].total_ws, 1);
+                       btrfs_comp_ws[i].free_ws = 1;
+                       list_add(workspace, &btrfs_comp_ws[i].idle_ws);
+               }
         }
  }
  
  /*
- * this finds an available workspace or allocates a new one
- * ERR_PTR is returned if things go bad.
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, waits until there's one.
+ * Preallocation guarantees forward progress, so we do not return
+ * errors.
   */
  static struct list_head *find_workspace(int type)
  {
@@ -777,36 +798,58 @@ static struct list_head *find_workspace(int type)
  
         struct list_head *idle_ws       = &btrfs_comp_ws[idx].idle_ws;
         spinlock_t *ws_lock             = &btrfs_comp_ws[idx].ws_lock;
-       atomic_t *alloc_ws              = &btrfs_comp_ws[idx].alloc_ws;
+       atomic_t *total_ws              = &btrfs_comp_ws[idx].total_ws;
         wait_queue_head_t *ws_wait      = &btrfs_comp_ws[idx].ws_wait;
-       int *num_ws                     = &btrfs_comp_ws[idx].num_ws;
+       int *free_ws                    = &btrfs_comp_ws[idx].free_ws;
  again:
         spin_lock(ws_lock);
         if (!list_empty(idle_ws)) {
                 workspace = idle_ws->next;
                 list_del(workspace);
-               (*num_ws)--;
+               (*free_ws)--;
                 spin_unlock(ws_lock);
                 return workspace;
  
         }
-       if (atomic_read(alloc_ws) > cpus) {
+       if (atomic_read(total_ws) > cpus) {
                 DEFINE_WAIT(wait);
  
                 spin_unlock(ws_lock);
                 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
-               if (atomic_read(alloc_ws) > cpus && !*num_ws)
+               if (atomic_read(total_ws) > cpus && !*free_ws)
                         schedule();
                 finish_wait(ws_wait, &wait);
                 goto again;
         }
-       atomic_inc(alloc_ws);
+       atomic_inc(total_ws);
         spin_unlock(ws_lock);
  
         workspace = btrfs_compress_op[idx]->alloc_workspace();
         if (IS_ERR(workspace)) {
-               atomic_dec(alloc_ws);
+               atomic_dec(total_ws);
                 wake_up(ws_wait);
+
+               /*
+                * Do not return the error but go back to waiting. There's a
+                * workspace preallocated for each type and the compression
+                * time is bounded so we get to a workspace eventually. This
+                * makes our caller's life easier.
+                *
+                * To prevent silent and low-probability deadlocks (when the
+                * initial preallocation fails), check if there are any
+                * workspaces at all.
+                */
+               if (atomic_read(total_ws) == 0) {
+                       static DEFINE_RATELIMIT_STATE(_rs,
+                                       /* once per minute */ 60 * HZ,
+                                       /* no burst */ 1);
+
+                       if (__ratelimit(&_rs)) {
+                               printk(KERN_WARNING
+                           "no compression workspaces, low memory, retrying");
+                       }
+               }
+               goto again;
         }
         return workspace;
  }
@@ -820,21 +863,21 @@ static void free_workspace(int type, struct list_head *workspace)
         int idx = type - 1;
         struct list_head *idle_ws       = &btrfs_comp_ws[idx].idle_ws;
         spinlock_t *ws_lock             = &btrfs_comp_ws[idx].ws_lock;
-       atomic_t *alloc_ws              = &btrfs_comp_ws[idx].alloc_ws;
+       atomic_t *total_ws              = &btrfs_comp_ws[idx].total_ws;
         wait_queue_head_t *ws_wait      = &btrfs_comp_ws[idx].ws_wait;
-       int *num_ws                     = &btrfs_comp_ws[idx].num_ws;
+       int *free_ws                    = &btrfs_comp_ws[idx].free_ws;
  
         spin_lock(ws_lock);
-       if (*num_ws < num_online_cpus()) {
+       if (*free_ws < num_online_cpus()) {
                 list_add(workspace, idle_ws);
-               (*num_ws)++;
+               (*free_ws)++;
                 spin_unlock(ws_lock);
                 goto wake;
         }
         spin_unlock(ws_lock);
  
         btrfs_compress_op[idx]->free_workspace(workspace);
-       atomic_dec(alloc_ws);
+       atomic_dec(total_ws);
  wake:
         /*
          * Make sure counter is updated before we wake up waiters.
@@ -857,7 +900,7 @@ static void free_workspaces(void)
                         workspace = btrfs_comp_ws[i].idle_ws.next;
                         list_del(workspace);
                         btrfs_compress_op[i]->free_workspace(workspace);
-                       atomic_dec(&btrfs_comp_ws[i].alloc_ws);
+                       atomic_dec(&btrfs_comp_ws[i].total_ws);
                 }
         }
  }
@@ -894,8 +937,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
         int ret;
  
         workspace = find_workspace(type);
-       if (IS_ERR(workspace))
-               return PTR_ERR(workspace);
  
         ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
                                                       start, len, pages,
@@ -930,8 +971,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
         int ret;
  
         workspace = find_workspace(type);
-       if (IS_ERR(workspace))
-               return PTR_ERR(workspace);
  
         ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
                                                          disk_start,
@@ -952,8 +991,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
         int ret;
  
         workspace = find_workspace(type);
-       if (IS_ERR(workspace))
-               return PTR_ERR(workspace);
  
         ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
                                                   dest_page, start_byte,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c

index ec7928a..decd0a3 100644 (file)
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                         return ret;
                 if (refs == 0) {
                         ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret, NULL);
+                       btrfs_handle_fs_error(root->fs_info, ret, NULL);
                         return ret;
                 }
         } else {
@@ -1928,7 +1928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 child = read_node_slot(root, mid, 0);
                 if (!child) {
                         ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret, NULL);
+                       btrfs_handle_fs_error(root->fs_info, ret, NULL);
                         goto enospc;
                 }
  
@@ -2031,7 +2031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                  */
                 if (!left) {
                         ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret, NULL);
+                       btrfs_handle_fs_error(root->fs_info, ret, NULL);
                         goto enospc;
                 }
                 wret = balance_node_right(trans, root, mid, left);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 84a6a5b..ddcc58f 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
  #include <asm/kmap_types.h>
  #include <linux/pagemap.h>
  #include <linux/btrfs.h>
+#include <linux/btrfs_tree.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
  #include <linux/sizes.h>
@@ -64,98 +65,6 @@ struct btrfs_ordered_sum;
  
  #define BTRFS_COMPAT_EXTENT_TREE_V0
  
-/* holds pointers to all of the tree roots */
-#define BTRFS_ROOT_TREE_OBJECTID 1ULL
-
-/* stores information about which extents are in use, and reference counts */
-#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-
-/*
- * chunk tree stores translations from logical -> physical block numbering
- * the super block points to the chunk tree
- */
-#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
-
-/*
- * stores information about which areas of a given device are in use.
- * one per device.  The tree of tree roots points to the device tree
- */
-#define BTRFS_DEV_TREE_OBJECTID 4ULL
-
-/* one per subvolume, storing files and directories */
-#define BTRFS_FS_TREE_OBJECTID 5ULL
-
-/* directory objectid inside the root tree */
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
-
-/* holds checksums of all the data extents */
-#define BTRFS_CSUM_TREE_OBJECTID 7ULL
-
-/* holds quota configuration and tracking */
-#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
-
-/* for storing items that use the BTRFS_UUID_KEY* types */
-#define BTRFS_UUID_TREE_OBJECTID 9ULL
-
-/* tracks free space in block groups. */
-#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
-
-/* device stats in the device tree */
-#define BTRFS_DEV_STATS_OBJECTID 0ULL
-
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
-/* orhpan objectid for tracking unlinked/truncated files */
-#define BTRFS_ORPHAN_OBJECTID -5ULL
-
-/* does write ahead logging to speed up fsyncs */
-#define BTRFS_TREE_LOG_OBJECTID -6ULL
-#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
-
-/* for space balancing */
-#define BTRFS_TREE_RELOC_OBJECTID -8ULL
-#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
-
-/*
- * extent checksums all have this objectid
- * this allows them to share the logging tree
- * for fsyncs
- */
-#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
-
-/* For storing free space cache */
-#define BTRFS_FREE_SPACE_OBJECTID -11ULL
-
-/*
- * The inode number assigned to the special inode for storing
- * free ino cache
- */
-#define BTRFS_FREE_INO_OBJECTID -12ULL
-
-/* dummy objectid represents multiple objectids */
-#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
-
-/*
- * All files have objectids in this range.
- */
-#define BTRFS_FIRST_FREE_OBJECTID 256ULL
-#define BTRFS_LAST_FREE_OBJECTID -256ULL
-#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
-
-
-/*
- * the device items go into the chunk tree.  The key is in the form
- * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
- */
-#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
-
-#define BTRFS_BTREE_INODE_OBJECTID 1
-
-#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
-
-#define BTRFS_DEV_REPLACE_DEVID 0ULL
-
  /*
   * the max metadata block size.  This limit is somewhat artificial,
   * but the memmove costs go through the roof for larger blocks.
@@ -175,12 +84,6 @@ struct btrfs_ordered_sum;
   */
  #define BTRFS_LINK_MAX 65535U
  
-/* 32 bytes in various csum fields */
-#define BTRFS_CSUM_SIZE 32
-
-/* csum types */
-#define BTRFS_CSUM_TYPE_CRC32  0
-
  static const int btrfs_csum_sizes[] = { 4 };
  
  /* four bytes for CRC32 */
@@ -189,17 +92,6 @@ static const int btrfs_csum_sizes[] = { 4 };
  /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
  #define REQ_GET_READ_MIRRORS   (1 << 30)
  
-#define BTRFS_FT_UNKNOWN       0
-#define BTRFS_FT_REG_FILE      1
-#define BTRFS_FT_DIR           2
-#define BTRFS_FT_CHRDEV                3
-#define BTRFS_FT_BLKDEV                4
-#define BTRFS_FT_FIFO          5
-#define BTRFS_FT_SOCK          6
-#define BTRFS_FT_SYMLINK       7
-#define BTRFS_FT_XATTR         8
-#define BTRFS_FT_MAX           9
-
  /* ioprio of readahead is set to idle */
  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
  
@@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 };
  
  #define BTRFS_MAX_EXTENT_SIZE SZ_128M
  
-/*
- * The key defines the order in the tree, and so it also defines (optimal)
- * block layout.
- *
- * objectid corresponds to the inode number.
- *
- * type tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with type of 1 might refer to the inode data,
- * type of 2 may point to file data in the btree and type == 3 may point to
- * extents.
- *
- * offset is the starting byte offset for this key in the stream.
- *
- * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
- * in cpu native order.  Otherwise they are identical and their sizes
- * should be the same (ie both packed)
- */
-struct btrfs_disk_key {
-       __le64 objectid;
-       u8 type;
-       __le64 offset;
-} __attribute__ ((__packed__));
-
-struct btrfs_key {
-       u64 objectid;
-       u8 type;
-       u64 offset;
-} __attribute__ ((__packed__));
-
  struct btrfs_mapping_tree {
         struct extent_map_tree map_tree;
  };
  
-struct btrfs_dev_item {
-       /* the internal btrfs device id */
-       __le64 devid;
-
-       /* size of the device */
-       __le64 total_bytes;
-
-       /* bytes used */
-       __le64 bytes_used;
-
-       /* optimal io alignment for this device */
-       __le32 io_align;
-
-       /* optimal io width for this device */
-       __le32 io_width;
-
-       /* minimal io size for this device */
-       __le32 sector_size;
-
-       /* type and info about this device */
-       __le64 type;
-
-       /* expected generation for this device */
-       __le64 generation;
-
-       /*
-        * starting byte of this partition on the device,
-        * to allow for stripe alignment in the future
-        */
-       __le64 start_offset;
-
-       /* grouping information for allocation decisions */
-       __le32 dev_group;
-
-       /* seek speed 0-100 where 100 is fastest */
-       u8 seek_speed;
-
-       /* bandwidth 0-100 where 100 is fastest */
-       u8 bandwidth;
-
-       /* btrfs generated uuid for this device */
-       u8 uuid[BTRFS_UUID_SIZE];
-
-       /* uuid of FS who owns this device */
-       u8 fsid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_stripe {
-       __le64 devid;
-       __le64 offset;
-       u8 dev_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_chunk {
-       /* size of this chunk in bytes */
-       __le64 length;
-
-       /* objectid of the root referencing this chunk */
-       __le64 owner;
-
-       __le64 stripe_len;
-       __le64 type;
-
-       /* optimal io alignment for this chunk */
-       __le32 io_align;
-
-       /* optimal io width for this chunk */
-       __le32 io_width;
-
-       /* minimal io size for this chunk */
-       __le32 sector_size;
-
-       /* 2^16 stripes is quite a lot, a second limit is the size of a single
-        * item in the btree
-        */
-       __le16 num_stripes;
-
-       /* sub stripes only matter for raid10 */
-       __le16 sub_stripes;
-       struct btrfs_stripe stripe;
-       /* additional stripes go here */
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_EXTENT        1
-#define BTRFS_FREE_SPACE_BITMAP        2
-
-struct btrfs_free_space_entry {
-       __le64 offset;
-       __le64 bytes;
-       u8 type;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_header {
-       struct btrfs_disk_key location;
-       __le64 generation;
-       __le64 num_entries;
-       __le64 num_bitmaps;
-} __attribute__ ((__packed__));
-
  static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  {
         BUG_ON(num_stripes == 0);
@@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
                 sizeof(struct btrfs_stripe) * (num_stripes - 1);
  }
  
-#define BTRFS_HEADER_FLAG_WRITTEN      (1ULL << 0)
-#define BTRFS_HEADER_FLAG_RELOC                (1ULL << 1)
-
  /*
   * File system states
   */
@@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  #define BTRFS_FS_STATE_TRANS_ABORTED   2
  #define BTRFS_FS_STATE_DEV_REPLACING   3
  
-/* Super block flags */
-/* Errors detected */
-#define BTRFS_SUPER_FLAG_ERROR         (1ULL << 2)
-
-#define BTRFS_SUPER_FLAG_SEEDING       (1ULL << 32)
-#define BTRFS_SUPER_FLAG_METADUMP      (1ULL << 33)
-
  #define BTRFS_BACKREF_REV_MAX          256
  #define BTRFS_BACKREF_REV_SHIFT                56
  #define BTRFS_BACKREF_REV_MASK         (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
@@ -410,7 +164,6 @@ struct btrfs_header {
   * room to translate 14 chunks with 3 stripes each.
   */
  #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
-#define BTRFS_LABEL_SIZE 256
  
  /*
   * just in case we somehow lose the roots and are not able to mount,
@@ -507,31 +260,6 @@ struct btrfs_super_block {
   * Compat flags that we support.  If any incompat flags are set other than the
   * ones specified below then we will fail to mount
   */
-#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE        (1ULL << 0)
-
-#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
-#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS    (1ULL << 2)
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO    (1ULL << 3)
-/*
- * some patches floated around with a second compression method
- * lets save that incompat here for when they do get in
- * Note we don't actually support it, we're just reserving the
- * number
- */
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2  (1ULL << 4)
-
-/*
- * older kernels tried to do bigger metadata blocks, but the
- * code was pretty buggy.  Lets not let them try anymore.
- */
-#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA    (1ULL << 5)
-
-#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF   (1ULL << 6)
-#define BTRFS_FEATURE_INCOMPAT_RAID56          (1ULL << 7)
-#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
-#define BTRFS_FEATURE_INCOMPAT_NO_HOLES                (1ULL << 9)
-
  #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_SET          0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR                0ULL
@@ -624,357 +352,8 @@ struct btrfs_path {
         unsigned int need_commit_sem:1;
         unsigned int skip_release_on_error:1;
  };
-
-/*
- * items in the extent btree are used to record the objectid of the
- * owner of the block and the number of references
- */
-
-struct btrfs_extent_item {
-       __le64 refs;
-       __le64 generation;
-       __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_item_v0 {
-       __le32 refs;
-} __attribute__ ((__packed__));
-
  #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
                                         sizeof(struct btrfs_item))
-
-#define BTRFS_EXTENT_FLAG_DATA         (1ULL << 0)
-#define BTRFS_EXTENT_FLAG_TREE_BLOCK   (1ULL << 1)
-
-/* following flags only apply to tree blocks */
-
-/* use full backrefs for extent pointers in the block */
-#define BTRFS_BLOCK_FLAG_FULL_BACKREF  (1ULL << 8)
-
-/*
- * this flag is only used internally by scrub and may be changed at any time
- * it is only declared here to avoid collisions
- */
-#define BTRFS_EXTENT_FLAG_SUPER                (1ULL << 48)
-
-struct btrfs_tree_block_info {
-       struct btrfs_disk_key key;
-       u8 level;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_data_ref {
-       __le64 root;
-       __le64 objectid;
-       __le64 offset;
-       __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_shared_data_ref {
-       __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_inline_ref {
-       u8 type;
-       __le64 offset;
-} __attribute__ ((__packed__));
-
-/* old style backrefs item */
-struct btrfs_extent_ref_v0 {
-       __le64 root;
-       __le64 generation;
-       __le64 objectid;
-       __le32 count;
-} __attribute__ ((__packed__));
-
-
-/* dev extents record free space on individual devices.  The owner
- * field points back to the chunk allocation mapping tree that allocated
- * the extent.  The chunk tree uuid field is a way to double check the owner
- */
-struct btrfs_dev_extent {
-       __le64 chunk_tree;
-       __le64 chunk_objectid;
-       __le64 chunk_offset;
-       __le64 length;
-       u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_ref {
-       __le64 index;
-       __le16 name_len;
-       /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_extref {
-       __le64 parent_objectid;
-       __le64 index;
-       __le16 name_len;
-       __u8   name[0];
-       /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_timespec {
-       __le64 sec;
-       __le32 nsec;
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_item {
-       /* nfs style generation number */
-       __le64 generation;
-       /* transid that last touched this inode */
-       __le64 transid;
-       __le64 size;
-       __le64 nbytes;
-       __le64 block_group;
-       __le32 nlink;
-       __le32 uid;
-       __le32 gid;
-       __le32 mode;
-       __le64 rdev;
-       __le64 flags;
-
-       /* modification sequence number for NFS */
-       __le64 sequence;
-
-       /*
-        * a little future expansion, for more than this we can
-        * just grow the inode item and version it
-        */
-       __le64 reserved[4];
-       struct btrfs_timespec atime;
-       struct btrfs_timespec ctime;
-       struct btrfs_timespec mtime;
-       struct btrfs_timespec otime;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_log_item {
-       __le64 end;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_item {
-       struct btrfs_disk_key location;
-       __le64 transid;
-       __le16 data_len;
-       __le16 name_len;
-       u8 type;
-} __attribute__ ((__packed__));
-
-#define BTRFS_ROOT_SUBVOL_RDONLY       (1ULL << 0)
-
-/*
- * Internal in-memory flag that a subvolume has been marked for deletion but
- * still visible as a directory
- */
-#define BTRFS_ROOT_SUBVOL_DEAD         (1ULL << 48)
-
-struct btrfs_root_item {
-       struct btrfs_inode_item inode;
-       __le64 generation;
-       __le64 root_dirid;
-       __le64 bytenr;
-       __le64 byte_limit;
-       __le64 bytes_used;
-       __le64 last_snapshot;
-       __le64 flags;
-       __le32 refs;
-       struct btrfs_disk_key drop_progress;
-       u8 drop_level;
-       u8 level;
-
-       /*
-        * The following fields appear after subvol_uuids+subvol_times
-        * were introduced.
-        */
-
-       /*
-        * This generation number is used to test if the new fields are valid
-        * and up to date while reading the root item. Every time the root item
-        * is written out, the "generation" field is copied into this field. If
-        * anyone ever mounted the fs with an older kernel, we will have
-        * mismatching generation values here and thus must invalidate the
-        * new fields. See btrfs_update_root and btrfs_find_last_root for
-        * details.
-        * the offset of generation_v2 is also used as the start for the memset
-        * when invalidating the fields.
-        */
-       __le64 generation_v2;
-       u8 uuid[BTRFS_UUID_SIZE];
-       u8 parent_uuid[BTRFS_UUID_SIZE];
-       u8 received_uuid[BTRFS_UUID_SIZE];
-       __le64 ctransid; /* updated when an inode changes */
-       __le64 otransid; /* trans when created */
-       __le64 stransid; /* trans when sent. non-zero for received subvol */
-       __le64 rtransid; /* trans when received. non-zero for received subvol */
-       struct btrfs_timespec ctime;
-       struct btrfs_timespec otime;
-       struct btrfs_timespec stime;
-       struct btrfs_timespec rtime;
-       __le64 reserved[8]; /* for future */
-} __attribute__ ((__packed__));
-
-/*
- * this is used for both forward and backward root refs
- */
-struct btrfs_root_ref {
-       __le64 dirid;
-       __le64 sequence;
-       __le16 name_len;
-} __attribute__ ((__packed__));
-
-struct btrfs_disk_balance_args {
-       /*
-        * profiles to operate on, single is denoted by
-        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
-        */
-       __le64 profiles;
-
-       /*
-        * usage filter
-        * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
-        * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
-        */
-       union {
-               __le64 usage;
-               struct {
-                       __le32 usage_min;
-                       __le32 usage_max;
-               };
-       };
-
-       /* devid filter */
-       __le64 devid;
-
-       /* devid subset filter [pstart..pend) */
-       __le64 pstart;
-       __le64 pend;
-
-       /* btrfs virtual address space subset filter [vstart..vend) */
-       __le64 vstart;
-       __le64 vend;
-
-       /*
-        * profile to convert to, single is denoted by
-        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
-        */
-       __le64 target;
-
-       /* BTRFS_BALANCE_ARGS_* */
-       __le64 flags;
-
-       /*
-        * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
-        * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
-        * and maximum
-        */
-       union {
-               __le64 limit;
-               struct {
-                       __le32 limit_min;
-                       __le32 limit_max;
-               };
-       };
-
-       /*
-        * Process chunks that cross stripes_min..stripes_max devices,
-        * BTRFS_BALANCE_ARGS_STRIPES_RANGE
-        */
-       __le32 stripes_min;
-       __le32 stripes_max;
-
-       __le64 unused[6];
-} __attribute__ ((__packed__));
-
-/*
- * store balance parameters to disk so that balance can be properly
- * resumed after crash or unmount
- */
-struct btrfs_balance_item {
-       /* BTRFS_BALANCE_* */
-       __le64 flags;
-
-       struct btrfs_disk_balance_args data;
-       struct btrfs_disk_balance_args meta;
-       struct btrfs_disk_balance_args sys;
-
-       __le64 unused[4];
-} __attribute__ ((__packed__));
-
-#define BTRFS_FILE_EXTENT_INLINE 0
-#define BTRFS_FILE_EXTENT_REG 1
-#define BTRFS_FILE_EXTENT_PREALLOC 2
-
-struct btrfs_file_extent_item {
-       /*
-        * transaction id that created this extent
-        */
-       __le64 generation;
-       /*
-        * max number of bytes to hold this extent in ram
-        * when we split a compressed extent we can't know how big
-        * each of the resulting pieces will be.  So, this is
-        * an upper limit on the size of the extent in ram instead of
-        * an exact limit.
-        */
-       __le64 ram_bytes;
-
-       /*
-        * 32 bits for the various ways we might encode the data,
-        * including compression and encryption.  If any of these
-        * are set to something a given disk format doesn't understand
-        * it is treated like an incompat flag for reading and writing,
-        * but not for stat.
-        */
-       u8 compression;
-       u8 encryption;
-       __le16 other_encoding; /* spare for later use */
-
-       /* are we inline data or a real extent? */
-       u8 type;
-
-       /*
-        * disk space consumed by the extent, checksum blocks are included
-        * in these numbers
-        *
-        * At this offset in the structure, the inline extent data start.
-        */
-       __le64 disk_bytenr;
-       __le64 disk_num_bytes;
-       /*
-        * the logical offset in file blocks (no csums)
-        * this extent record is for.  This allows a file extent to point
-        * into the middle of an existing extent on disk, sharing it
-        * between two snapshots (useful if some bytes in the middle of the
-        * extent have changed
-        */
-       __le64 offset;
-       /*
-        * the logical number of file blocks (no csums included).  This
-        * always reflects the size uncompressed and without encoding.
-        */
-       __le64 num_bytes;
-
-} __attribute__ ((__packed__));
-
-struct btrfs_csum_item {
-       u8 csum;
-} __attribute__ ((__packed__));
-
-struct btrfs_dev_stats_item {
-       /*
-        * grow this item struct at the end for future enhancements and keep
-        * the existing values unchanged
-        */
-       __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
-} __attribute__ ((__packed__));
-
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS    0
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID     1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED     0
-#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED           1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED         2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED          3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED          4
-
  struct btrfs_dev_replace {
         u64 replace_state;      /* see #define above */
         u64 time_started;       /* seconds since 1-Jan-1970 */
@@ -1005,175 +384,6 @@ struct btrfs_dev_replace {
         struct btrfs_scrub_progress scrub_progress;
  };
  
-struct btrfs_dev_replace_item {
-       /*
-        * grow this item struct at the end for future enhancements and keep
-        * the existing values unchanged
-        */
-       __le64 src_devid;
-       __le64 cursor_left;
-       __le64 cursor_right;
-       __le64 cont_reading_from_srcdev_mode;
-
-       __le64 replace_state;
-       __le64 time_started;
-       __le64 time_stopped;
-       __le64 num_write_errors;
-       __le64 num_uncorrectable_read_errors;
-} __attribute__ ((__packed__));
-
-/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
-#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
-#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
-#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
-#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
-#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
-#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED     (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
-                                        BTRFS_SPACE_INFO_GLOBAL_RSV)
-
-enum btrfs_raid_types {
-       BTRFS_RAID_RAID10,
-       BTRFS_RAID_RAID1,
-       BTRFS_RAID_DUP,
-       BTRFS_RAID_RAID0,
-       BTRFS_RAID_SINGLE,
-       BTRFS_RAID_RAID5,
-       BTRFS_RAID_RAID6,
-       BTRFS_NR_RAID_TYPES
-};
-
-#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
-                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
-                                        BTRFS_BLOCK_GROUP_METADATA)
-
-#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
-                                        BTRFS_BLOCK_GROUP_RAID1 |   \
-                                        BTRFS_BLOCK_GROUP_RAID5 |   \
-                                        BTRFS_BLOCK_GROUP_RAID6 |   \
-                                        BTRFS_BLOCK_GROUP_DUP |     \
-                                        BTRFS_BLOCK_GROUP_RAID10)
-#define BTRFS_BLOCK_GROUP_RAID56_MASK  (BTRFS_BLOCK_GROUP_RAID5 |   \
-                                        BTRFS_BLOCK_GROUP_RAID6)
-
-/*
- * We need a bit for restriper to be able to tell when chunks of type
- * SINGLE are available.  This "extended" profile format is used in
- * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
- * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
- * to avoid remappings between two formats in future.
- */
-#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
-
-/*
- * A fake block group type that is used to communicate global block reserve
- * size to userspace via the SPACE_INFO ioctl.
- */
-#define BTRFS_SPACE_INFO_GLOBAL_RSV    (1ULL << 49)
-
-#define BTRFS_EXTENDED_PROFILE_MASK    (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
-                                        BTRFS_AVAIL_ALLOC_BIT_SINGLE)
-
-static inline u64 chunk_to_extended(u64 flags)
-{
-       if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
-               flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-
-       return flags;
-}
-static inline u64 extended_to_chunk(u64 flags)
-{
-       return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-}
-
-struct btrfs_block_group_item {
-       __le64 used;
-       __le64 chunk_objectid;
-       __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_info {
-       __le32 extent_count;
-       __le32 flags;
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
-
-#define BTRFS_QGROUP_LEVEL_SHIFT               48
-static inline u64 btrfs_qgroup_level(u64 qgroupid)
-{
-       return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
-}
-
-/*
- * is subvolume quota turned on?
- */
-#define BTRFS_QGROUP_STATUS_FLAG_ON            (1ULL << 0)
-/*
- * RESCAN is set during the initialization phase
- */
-#define BTRFS_QGROUP_STATUS_FLAG_RESCAN                (1ULL << 1)
-/*
- * Some qgroup entries are known to be out of date,
- * either because the configuration has changed in a way that
- * makes a rescan necessary, or because the fs has been mounted
- * with a non-qgroup-aware version.
- * Turning qouta off and on again makes it inconsistent, too.
- */
-#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT  (1ULL << 2)
-
-#define BTRFS_QGROUP_STATUS_VERSION        1
-
-struct btrfs_qgroup_status_item {
-       __le64 version;
-       /*
-        * the generation is updated during every commit. As older
-        * versions of btrfs are not aware of qgroups, it will be
-        * possible to detect inconsistencies by checking the
-        * generation on mount time
-        */
-       __le64 generation;
-
-       /* flag definitions see above */
-       __le64 flags;
-
-       /*
-        * only used during scanning to record the progress
-        * of the scan. It contains a logical address
-        */
-       __le64 rescan;
-} __attribute__ ((__packed__));
-
-struct btrfs_qgroup_info_item {
-       __le64 generation;
-       __le64 rfer;
-       __le64 rfer_cmpr;
-       __le64 excl;
-       __le64 excl_cmpr;
-} __attribute__ ((__packed__));
-
-/* flags definition for qgroup limits */
-#define BTRFS_QGROUP_LIMIT_MAX_RFER    (1ULL << 0)
-#define BTRFS_QGROUP_LIMIT_MAX_EXCL    (1ULL << 1)
-#define BTRFS_QGROUP_LIMIT_RSV_RFER    (1ULL << 2)
-#define BTRFS_QGROUP_LIMIT_RSV_EXCL    (1ULL << 3)
-#define BTRFS_QGROUP_LIMIT_RFER_CMPR   (1ULL << 4)
-#define BTRFS_QGROUP_LIMIT_EXCL_CMPR   (1ULL << 5)
-
-struct btrfs_qgroup_limit_item {
-       /*
-        * only updated when any of the other values change
-        */
-       __le64 flags;
-       __le64 max_rfer;
-       __le64 max_excl;
-       __le64 rsv_rfer;
-       __le64 rsv_excl;
-} __attribute__ ((__packed__));
-
  /* For raid type sysfs entries */
  struct raid_kobject {
         int raid_type;
@@ -1408,6 +618,27 @@ struct btrfs_block_group_cache {
  
         struct btrfs_io_ctl io_ctl;
  
+       /*
+        * Incremented when doing extent allocations and holding a read lock
+        * on the space_info's groups_sem semaphore.
+        * Decremented when an ordered extent that represents an IO against this
+        * block group's range is created (after it's added to its inode's
+        * root's list of ordered extents) or immediately after the allocation
+        * if it's a metadata extent or fallocate extent (for these cases we
+        * don't create ordered extents).
+        */
+       atomic_t reservations;
+
+       /*
+        * Incremented while holding the spinlock *lock* by a task checking if
+        * it can perform a nocow write (incremented if the value for the *ro*
+        * field is 0). Decremented by such tasks once they create an ordered
+        * extent or before that if some error happens before reaching that step.
+        * This is to prevent races between block group relocation and nocow
+        * writes through direct IO.
+        */
+       atomic_t nocow_writers;
+
         /* Lock for free space tree operations. */
         struct mutex free_space_lock;
  
@@ -2026,228 +1257,6 @@ struct btrfs_root {
         atomic_t qgroup_meta_rsv;
  };
  
-struct btrfs_ioctl_defrag_range_args {
-       /* start of the defrag operation */
-       __u64 start;
-
-       /* number of bytes to defrag, use (u64)-1 to say all */
-       __u64 len;
-
-       /*
-        * flags for the operation, which can include turning
-        * on compression for this one defrag
-        */
-       __u64 flags;
-
-       /*
-        * any extent bigger than this will be considered
-        * already defragged.  Use 0 to take the kernel default
-        * Use 1 to say every single extent must be rewritten
-        */
-       __u32 extent_thresh;
-
-       /*
-        * which compression method to use if turning on compression
-        * for this defrag operation.  If unspecified, zlib will
-        * be used
-        */
-       __u32 compress_type;
-
-       /* spare for later */
-       __u32 unused[4];
-};
-
-
-/*
- * inode items have the data typically returned from stat and store other
- * info about object characteristics.  There is one for every file and dir in
- * the FS
- */
-#define BTRFS_INODE_ITEM_KEY           1
-#define BTRFS_INODE_REF_KEY            12
-#define BTRFS_INODE_EXTREF_KEY         13
-#define BTRFS_XATTR_ITEM_KEY           24
-#define BTRFS_ORPHAN_ITEM_KEY          48
-/* reserve 2-15 close to the inode for later flexibility */
-
-/*
- * dir items are the name -> inode pointers in a directory.  There is one
- * for every name in a directory.
- */
-#define BTRFS_DIR_LOG_ITEM_KEY  60
-#define BTRFS_DIR_LOG_INDEX_KEY 72
-#define BTRFS_DIR_ITEM_KEY     84
-#define BTRFS_DIR_INDEX_KEY    96
-/*
- * extent data is for file data
- */
-#define BTRFS_EXTENT_DATA_KEY  108
-
-/*
- * extent csums are stored in a separate tree and hold csums for
- * an entire extent on disk.
- */
-#define BTRFS_EXTENT_CSUM_KEY  128
-
-/*
- * root items point to tree roots.  They are typically in the root
- * tree used by the super block to find all the other trees
- */
-#define BTRFS_ROOT_ITEM_KEY    132
-
-/*
- * root backrefs tie subvols and snapshots to the directory entries that
- * reference them
- */
-#define BTRFS_ROOT_BACKREF_KEY 144
-
-/*
- * root refs make a fast index for listing all of the snapshots and
- * subvolumes referenced by a given root.  They point directly to the
- * directory item in the root that references the subvol
- */
-#define BTRFS_ROOT_REF_KEY     156
-
-/*
- * extent items are in the extent map tree.  These record which blocks
- * are used, and how many references there are to each block
- */
-#define BTRFS_EXTENT_ITEM_KEY  168
-
-/*
- * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
- * the length, so we save the level in key->offset instead of the length.
- */
-#define BTRFS_METADATA_ITEM_KEY        169
-
-#define BTRFS_TREE_BLOCK_REF_KEY       176
-
-#define BTRFS_EXTENT_DATA_REF_KEY      178
-
-#define BTRFS_EXTENT_REF_V0_KEY                180
-
-#define BTRFS_SHARED_BLOCK_REF_KEY     182
-
-#define BTRFS_SHARED_DATA_REF_KEY      184
-
-/*
- * block groups give us hints into the extent allocation trees.  Which
- * blocks are free etc etc
- */
-#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
-
-/*
- * Every block group is represented in the free space tree by a free space info
- * item, which stores some accounting information. It is keyed on
- * (block_group_start, FREE_SPACE_INFO, block_group_length).
- */
-#define BTRFS_FREE_SPACE_INFO_KEY 198
-
-/*
- * A free space extent tracks an extent of space that is free in a block group.
- * It is keyed on (start, FREE_SPACE_EXTENT, length).
- */
-#define BTRFS_FREE_SPACE_EXTENT_KEY 199
-
-/*
- * When a block group becomes very fragmented, we convert it to use bitmaps
- * instead of extents. A free space bitmap is keyed on
- * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
- * (length / sectorsize) bits.
- */
-#define BTRFS_FREE_SPACE_BITMAP_KEY 200
-
-#define BTRFS_DEV_EXTENT_KEY   204
-#define BTRFS_DEV_ITEM_KEY     216
-#define BTRFS_CHUNK_ITEM_KEY   228
-
-/*
- * Records the overall state of the qgroups.
- * There's only one instance of this key present,
- * (0, BTRFS_QGROUP_STATUS_KEY, 0)
- */
-#define BTRFS_QGROUP_STATUS_KEY         240
-/*
- * Records the currently used space of the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_INFO_KEY           242
-/*
- * Contains the user configured limits for the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_LIMIT_KEY          244
-/*
- * Records the child-parent relationship of qgroups. For
- * each relation, 2 keys are present:
- * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
- * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
- */
-#define BTRFS_QGROUP_RELATION_KEY       246
-
-/*
- * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
- */
-#define BTRFS_BALANCE_ITEM_KEY 248
-
-/*
- * The key type for tree items that are stored persistently, but do not need to
- * exist for extended period of time. The items can exist in any tree.
- *
- * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - balance status item
- *   (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
- */
-#define BTRFS_TEMPORARY_ITEM_KEY       248
-
-/*
- * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
- */
-#define BTRFS_DEV_STATS_KEY            249
-
-/*
- * The key type for tree items that are stored persistently and usually exist
- * for a long period, eg. filesystem lifetime. The item kinds can be status
- * information, stats or preference values. The item can exist in any tree.
- *
- * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - device statistics, store IO stats in the device tree, one key for all
- *   stats
- *   (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
- */
-#define BTRFS_PERSISTENT_ITEM_KEY      249
-
-/*
- * Persistantly stores the device replace state in the device tree.
- * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
- */
-#define BTRFS_DEV_REPLACE_KEY  250
-
-/*
- * Stores items that allow to quickly map UUIDs to something else.
- * These items are part of the filesystem UUID tree.
- * The key is built like this:
- * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
- */
-#if BTRFS_UUID_SIZE != 16
-#error "UUID items require BTRFS_UUID_SIZE == 16!"
-#endif
-#define BTRFS_UUID_KEY_SUBVOL  251     /* for UUIDs assigned to subvols */
-#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252     /* for UUIDs assigned to
-                                                * received subvols */
-
-/*
- * string items are for debugging.  They just store a short string of
- * data in the FS
- */
-#define BTRFS_STRING_ITEM_KEY  253
-
  /*
   * Flags for mount options.
   *
@@ -3499,6 +2508,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root);
  int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                        const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, unsigned long count);
@@ -4122,6 +3137,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
  
  /* ioctl.c */
  long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  int btrfs_ioctl_get_supported_features(void __user *arg);
  void btrfs_update_iflags(struct inode *inode);
  void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
@@ -4326,10 +3342,9 @@ static inline void assfail(char *expr, char *file, int line)
  #define ASSERT(expr)   ((void)0)
  #endif
  
-#define btrfs_assert()
  __printf(5, 6)
  __cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
                      unsigned int line, int errno, const char *fmt, ...);
  
  const char *btrfs_decode_error(int errno);
@@ -4339,6 +3354,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root, const char *function,
                                unsigned int line, int errno);
  
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+#define btrfs_abort_transaction(trans, root, errno)            \
+do {                                                           \
+       /* Report first abort since mount */                    \
+       if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
+                       &((root)->fs_info->fs_state))) {        \
+               WARN(1, KERN_DEBUG                              \
+               "BTRFS: Transaction aborted (error %d)\n",      \
+               (errno));                                       \
+       }                                                       \
+       __btrfs_abort_transaction((trans), (root), __func__,    \
+                                 __LINE__, (errno));           \
+} while (0)
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...)            \
+do {                                                           \
+       __btrfs_handle_fs_error((fs_info), __func__, __LINE__,  \
+                         (errno), fmt, ##args);                \
+} while (0)
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+                  unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...)                      \
+do {                                                                   \
+       __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+       BUG();                                                          \
+} while (0)
+
+
+/* compatibility and incompatibility defines */
+
  #define btrfs_set_fs_incompat(__fs_info, opt) \
         __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
  
@@ -4455,44 +3510,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
         return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
  }
  
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
- */
-#define btrfs_abort_transaction(trans, root, errno)            \
-do {                                                           \
-       /* Report first abort since mount */                    \
-       if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
-                       &((root)->fs_info->fs_state))) {        \
-               WARN(1, KERN_DEBUG                              \
-               "BTRFS: Transaction aborted (error %d)\n",      \
-               (errno));                                       \
-       }                                                       \
-       __btrfs_abort_transaction((trans), (root), __func__,    \
-                                 __LINE__, (errno));           \
-} while (0)
-
-#define btrfs_std_error(fs_info, errno, fmt, args...)          \
-do {                                                           \
-       __btrfs_std_error((fs_info), __func__, __LINE__,        \
-                         (errno), fmt, ##args);                \
-} while (0)
-
-__printf(5, 6)
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
-                  unsigned int line, int errno, const char *fmt, ...);
-
-/*
- * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- * will panic().  Otherwise we BUG() here.
- */
-#define btrfs_panic(fs_info, errno, fmt, args...)                      \
-do {                                                                   \
-       __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
-       BUG();                                                          \
-} while (0)
-
  /* acl.c */
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
  struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c

index 6cef006..61561c2 100644 (file)
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -134,7 +134,7 @@ again:
         /* cached in the btrfs inode and can be accessed */
         atomic_add(2, &node->refs);
  
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       ret = radix_tree_preload(GFP_NOFS);
         if (ret) {
                 kmem_cache_free(delayed_node_cache, node);
                 return ERR_PTR(ret);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c

index 26bcb48..85f12e6 100644 (file)
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
                                                 struct btrfs_fs_info *fs_info,
                                                 struct btrfs_device *srcdev,
                                                 struct btrfs_device *tgtdev);
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
-                                        char *srcdev_name,
-                                        struct btrfs_device **device);
  static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
  static int btrfs_dev_replace_kthread(void *data);
  static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
                 dev_replace->cursor_left_last_write_of_item;
  }
  
-int btrfs_dev_replace_start(struct btrfs_root *root,
-                           struct btrfs_ioctl_dev_replace_args *args)
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+                               u64 srcdevid, char *srcdev_name, int read_src)
  {
         struct btrfs_trans_handle *trans;
         struct btrfs_fs_info *fs_info = root->fs_info;
@@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
         struct btrfs_device *tgt_device = NULL;
         struct btrfs_device *src_device = NULL;
  
-       switch (args->start.cont_reading_from_srcdev_mode) {
-       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
-       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
-           args->start.tgtdev_name[0] == '\0')
-               return -EINVAL;
-
         /* the disk copy procedure reuses the scrub code */
         mutex_lock(&fs_info->volume_mutex);
-       ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
-                                           args->start.srcdev_name,
-                                           &src_device);
+       ret = btrfs_find_device_by_devspec(root, srcdevid,
+                                           srcdev_name, &src_device);
         if (ret) {
                 mutex_unlock(&fs_info->volume_mutex);
                 return ret;
         }
  
-       ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+       ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
                                             src_device, &tgt_device);
         mutex_unlock(&fs_info->volume_mutex);
         if (ret)
@@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
                 break;
         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-               args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+               ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
                 goto leave;
         }
  
-       dev_replace->cont_reading_from_srcdev_mode =
-               args->start.cont_reading_from_srcdev_mode;
+       dev_replace->cont_reading_from_srcdev_mode = read_src;
         WARN_ON(!src_device);
         dev_replace->srcdev = src_device;
         WARN_ON(!tgt_device);
         dev_replace->tgtdev = tgt_device;
  
-       btrfs_info_in_rcu(root->fs_info,
+       btrfs_info_in_rcu(fs_info,
                       "dev_replace from %s (devid %llu) to %s started",
                       src_device->missing ? "<missing disk>" :
                         rcu_str_deref(src_device->name),
@@ -396,14 +379,13 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
         dev_replace->item_needs_writeback = 1;
         atomic64_set(&dev_replace->num_write_errors, 0);
         atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
-       args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
         btrfs_dev_replace_unlock(dev_replace, 1);
  
         ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
         if (ret)
-               btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+               btrfs_err(fs_info, "kobj add dev failed %d\n", ret);
  
-       btrfs_wait_ordered_roots(root->fs_info, -1);
+       btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
  
         /* force writing the updated state information to disk */
         trans = btrfs_start_transaction(root, 0);
@@ -421,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
                               btrfs_device_get_total_bytes(src_device),
                               &dev_replace->scrub_progress, 0, 1);
  
-       ret = btrfs_dev_replace_finishing(root->fs_info, ret);
-       /* don't warn if EINPROGRESS, someone else might be running scrub */
+       ret = btrfs_dev_replace_finishing(fs_info, ret);
         if (ret == -EINPROGRESS) {
-               args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-               ret = 0;
+               ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
         } else {
                 WARN_ON(ret);
         }
@@ -440,6 +420,35 @@ leave:
         return ret;
  }
  
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+                           struct btrfs_ioctl_dev_replace_args *args)
+{
+       int ret;
+
+       switch (args->start.cont_reading_from_srcdev_mode) {
+       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+           args->start.tgtdev_name[0] == '\0')
+               return -EINVAL;
+
+       ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
+                                       args->start.srcdevid,
+                                       args->start.srcdev_name,
+                                       args->start.cont_reading_from_srcdev_mode);
+       args->result = ret;
+       /* don't warn if EINPROGRESS, someone else might be running scrub */
+       if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
+               ret = 0;
+
+       return ret;
+}
+
  /*
   * blocked until all flighting bios are finished.
   */
@@ -495,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                 return ret;
         }
-       btrfs_wait_ordered_roots(root->fs_info, -1);
+       btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
  
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
@@ -560,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         ASSERT(list_empty(&src_device->resized_list));
         tgt_device->commit_total_bytes = src_device->commit_total_bytes;
         tgt_device->commit_bytes_used = src_device->bytes_used;
-       if (fs_info->sb->s_bdev == src_device->bdev)
-               fs_info->sb->s_bdev = tgt_device->bdev;
-       if (fs_info->fs_devices->latest_bdev == src_device->bdev)
-               fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+
+       btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
+
         list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
         fs_info->fs_devices->rw_devices++;
  
@@ -626,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
         write_unlock(&em_tree->lock);
  }
  
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
-                                        char *srcdev_name,
-                                        struct btrfs_device **device)
-{
-       int ret;
-
-       if (srcdevid) {
-               ret = 0;
-               *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
-                                           NULL);
-               if (!*device)
-                       ret = -ENOENT;
-       } else {
-               ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
-                                                          device);
-       }
-       return ret;
-}
-
  void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_dev_replace_args *args)
  {
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h

index 29e3ef5..e922b42 100644 (file)
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
  int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
                           struct btrfs_fs_info *fs_info);
  void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_start(struct btrfs_root *root,
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
                             struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+                               u64 srcdevid, char *srcdev_name, int read_src);
  void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_dev_replace_args *args);
  int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 4e47849..91d1239 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1640,7 +1640,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
  {
         int ret;
  
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       ret = radix_tree_preload(GFP_NOFS);
         if (ret)
                 return ret;
  
@@ -2417,7 +2417,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
         /* returns with log_tree_root freed on success */
         ret = btrfs_recover_log_trees(log_tree_root);
         if (ret) {
-               btrfs_std_error(tree_root->fs_info, ret,
+               btrfs_handle_fs_error(tree_root->fs_info, ret,
                             "Failed to recover log tree");
                 free_extent_buffer(log_tree_root->node);
                 kfree(log_tree_root);
@@ -2517,6 +2517,7 @@ int open_ctree(struct super_block *sb,
         int num_backups_tried = 0;
         int backup_index = 0;
         int max_active;
+       bool cleaner_mutex_locked = false;
  
         tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
         chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2713,7 +2714,7 @@ int open_ctree(struct super_block *sb,
          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
          */
         if (btrfs_check_super_csum(bh->b_data)) {
-               printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+               btrfs_err(fs_info, "superblock checksum mismatch");
                 err = -EINVAL;
                 brelse(bh);
                 goto fail_alloc;
@@ -2733,7 +2734,7 @@ int open_ctree(struct super_block *sb,
  
         ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
         if (ret) {
-               printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+               btrfs_err(fs_info, "superblock contains fatal errors");
                 err = -EINVAL;
                 goto fail_alloc;
         }
@@ -2768,9 +2769,9 @@ int open_ctree(struct super_block *sb,
         features = btrfs_super_incompat_flags(disk_super) &
                 ~BTRFS_FEATURE_INCOMPAT_SUPP;
         if (features) {
-               printk(KERN_ERR "BTRFS: couldn't mount because of "
-                      "unsupported optional features (%Lx).\n",
-                      features);
+               btrfs_err(fs_info,
+                   "cannot mount because of unsupported optional features (%llx)",
+                   features);
                 err = -EINVAL;
                 goto fail_alloc;
         }
@@ -2781,7 +2782,7 @@ int open_ctree(struct super_block *sb,
                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
  
         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-               printk(KERN_INFO "BTRFS: has skinny extents\n");
+               btrfs_info(fs_info, "has skinny extents");
  
         /*
          * flag our filesystem as having big metadata blocks if
@@ -2789,7 +2790,8 @@ int open_ctree(struct super_block *sb,
          */
         if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
                 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
-                       printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+                       btrfs_info(fs_info,
+                               "flagging fs with big metadata feature");
                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
         }
  
@@ -2805,9 +2807,9 @@ int open_ctree(struct super_block *sb,
          */
         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
             (sectorsize != nodesize)) {
-               printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
-                               "are not allowed for mixed block groups on %s\n",
-                               sb->s_id);
+               btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+                       nodesize, sectorsize);
                 goto fail_alloc;
         }
  
@@ -2820,8 +2822,8 @@ int open_ctree(struct super_block *sb,
         features = btrfs_super_compat_ro_flags(disk_super) &
                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
         if (!(sb->s_flags & MS_RDONLY) && features) {
-               printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
-                      "unsupported option features (%Lx).\n",
+               btrfs_err(fs_info,
+       "cannot mount read-write because of unsupported optional features (%llx)",
                        features);
                 err = -EINVAL;
                 goto fail_alloc;
@@ -2850,8 +2852,7 @@ int open_ctree(struct super_block *sb,
         ret = btrfs_read_sys_array(tree_root);
         mutex_unlock(&fs_info->chunk_mutex);
         if (ret) {
-               printk(KERN_ERR "BTRFS: failed to read the system "
-                      "array on %s\n", sb->s_id);
+               btrfs_err(fs_info, "failed to read the system array: %d", ret);
                 goto fail_sb_buffer;
         }
  
@@ -2865,8 +2866,7 @@ int open_ctree(struct super_block *sb,
                                            generation);
         if (IS_ERR(chunk_root->node) ||
             !extent_buffer_uptodate(chunk_root->node)) {
-               printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
-                      sb->s_id);
+               btrfs_err(fs_info, "failed to read chunk root");
                 if (!IS_ERR(chunk_root->node))
                         free_extent_buffer(chunk_root->node);
                 chunk_root->node = NULL;
@@ -2880,8 +2880,7 @@ int open_ctree(struct super_block *sb,
  
         ret = btrfs_read_chunk_tree(chunk_root);
         if (ret) {
-               printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
-                      sb->s_id);
+               btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
                 goto fail_tree_roots;
         }
  
@@ -2892,8 +2891,7 @@ int open_ctree(struct super_block *sb,
         btrfs_close_extra_devices(fs_devices, 0);
  
         if (!fs_devices->latest_bdev) {
-               printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
-                      sb->s_id);
+               btrfs_err(fs_info, "failed to read devices");
                 goto fail_tree_roots;
         }
  
@@ -2905,8 +2903,7 @@ retry_root_backup:
                                           generation);
         if (IS_ERR(tree_root->node) ||
             !extent_buffer_uptodate(tree_root->node)) {
-               printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
-                      sb->s_id);
+               btrfs_warn(fs_info, "failed to read tree root");
                 if (!IS_ERR(tree_root->node))
                         free_extent_buffer(tree_root->node);
                 tree_root->node = NULL;
@@ -2938,20 +2935,19 @@ retry_root_backup:
  
         ret = btrfs_recover_balance(fs_info);
         if (ret) {
-               printk(KERN_ERR "BTRFS: failed to recover balance\n");
+               btrfs_err(fs_info, "failed to recover balance: %d", ret);
                 goto fail_block_groups;
         }
  
         ret = btrfs_init_dev_stats(fs_info);
         if (ret) {
-               printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
-                      ret);
+               btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
                 goto fail_block_groups;
         }
  
         ret = btrfs_init_dev_replace(fs_info);
         if (ret) {
-               pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
+               btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
                 goto fail_block_groups;
         }
  
@@ -2959,31 +2955,33 @@ retry_root_backup:
  
         ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
         if (ret) {
-               pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+               btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
+                               ret);
                 goto fail_block_groups;
         }
  
         ret = btrfs_sysfs_add_device(fs_devices);
         if (ret) {
-               pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+               btrfs_err(fs_info, "failed to init sysfs device interface: %d",
+                               ret);
                 goto fail_fsdev_sysfs;
         }
  
         ret = btrfs_sysfs_add_mounted(fs_info);
         if (ret) {
-               pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+               btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
                 goto fail_fsdev_sysfs;
         }
  
         ret = btrfs_init_space_info(fs_info);
         if (ret) {
-               printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+               btrfs_err(fs_info, "failed to initialize space info: %d", ret);
                 goto fail_sysfs;
         }
  
         ret = btrfs_read_block_groups(fs_info->extent_root);
         if (ret) {
-               printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+               btrfs_err(fs_info, "failed to read block groups: %d", ret);
                 goto fail_sysfs;
         }
         fs_info->num_tolerated_disk_barrier_failures =
@@ -2991,12 +2989,20 @@ retry_root_backup:
         if (fs_info->fs_devices->missing_devices >
              fs_info->num_tolerated_disk_barrier_failures &&
             !(sb->s_flags & MS_RDONLY)) {
-               pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+               btrfs_warn(fs_info,
+"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed",
                         fs_info->fs_devices->missing_devices,
                         fs_info->num_tolerated_disk_barrier_failures);
                 goto fail_sysfs;
         }
  
+       /*
+        * Take cleaner_mutex here, before starting the cleaner kthread, so
+        * that we don't block for a long time on btrfs_recover_relocation.
+        * cleaner_kthread will wait for us to finish mounting the filesystem.
+        */
+       mutex_lock(&fs_info->cleaner_mutex);
+       cleaner_mutex_locked = true;
         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                                "btrfs-cleaner");
         if (IS_ERR(fs_info->cleaner_kthread))
@@ -3011,8 +3017,7 @@ retry_root_backup:
         if (!btrfs_test_opt(tree_root, SSD) &&
             !btrfs_test_opt(tree_root, NOSSD) &&
             !fs_info->fs_devices->rotating) {
-               printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
-                      "mode\n");
+               btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
                 btrfs_set_opt(fs_info->mount_opt, SSD);
         }
  
@@ -3030,8 +3035,9 @@ retry_root_backup:
                                     1 : 0,
                                     fs_info->check_integrity_print_mask);
                 if (ret)
-                       printk(KERN_WARNING "BTRFS: failed to initialize"
-                              " integrity check module %s\n", sb->s_id);
+                       btrfs_warn(fs_info,
+                               "failed to initialize integrity check module: %d",
+                               ret);
         }
  #endif
         ret = btrfs_read_qgroup_config(fs_info);
@@ -3056,17 +3062,17 @@ retry_root_backup:
                 ret = btrfs_cleanup_fs_roots(fs_info);
                 if (ret)
                         goto fail_qgroup;
-
-               mutex_lock(&fs_info->cleaner_mutex);
+               /* We locked cleaner_mutex before creating cleaner_kthread. */
                 ret = btrfs_recover_relocation(tree_root);
-               mutex_unlock(&fs_info->cleaner_mutex);
                 if (ret < 0) {
-                       printk(KERN_WARNING
-                              "BTRFS: failed to recover relocation\n");
+                       btrfs_warn(fs_info, "failed to recover relocation: %d",
+                                       ret);
                         err = -EINVAL;
                         goto fail_qgroup;
                 }
         }
+       mutex_unlock(&fs_info->cleaner_mutex);
+       cleaner_mutex_locked = false;
  
         location.objectid = BTRFS_FS_TREE_OBJECTID;
         location.type = BTRFS_ROOT_ITEM_KEY;
@@ -3083,11 +3089,11 @@ retry_root_backup:
  
         if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-               pr_info("BTRFS: creating free space tree\n");
+               btrfs_info(fs_info, "creating free space tree");
                 ret = btrfs_create_free_space_tree(fs_info);
                 if (ret) {
-                       pr_warn("BTRFS: failed to create free space tree %d\n",
-                               ret);
+                       btrfs_warn(fs_info,
+                               "failed to create free space tree: %d", ret);
                         close_ctree(tree_root);
                         return ret;
                 }
@@ -3104,14 +3110,14 @@ retry_root_backup:
  
         ret = btrfs_resume_balance_async(fs_info);
         if (ret) {
-               printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+               btrfs_warn(fs_info, "failed to resume balance: %d", ret);
                 close_ctree(tree_root);
                 return ret;
         }
  
         ret = btrfs_resume_dev_replace_async(fs_info);
         if (ret) {
-               pr_warn("BTRFS: failed to resume dev_replace\n");
+               btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
                 close_ctree(tree_root);
                 return ret;
         }
@@ -3120,33 +3126,33 @@ retry_root_backup:
  
         if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-               pr_info("BTRFS: clearing free space tree\n");
+               btrfs_info(fs_info, "clearing free space tree");
                 ret = btrfs_clear_free_space_tree(fs_info);
                 if (ret) {
-                       pr_warn("BTRFS: failed to clear free space tree %d\n",
-                               ret);
+                       btrfs_warn(fs_info,
+                               "failed to clear free space tree: %d", ret);
                         close_ctree(tree_root);
                         return ret;
                 }
         }
  
         if (!fs_info->uuid_root) {
-               pr_info("BTRFS: creating UUID tree\n");
+               btrfs_info(fs_info, "creating UUID tree");
                 ret = btrfs_create_uuid_tree(fs_info);
                 if (ret) {
-                       pr_warn("BTRFS: failed to create the UUID tree %d\n",
-                               ret);
+                       btrfs_warn(fs_info,
+                               "failed to create the UUID tree: %d", ret);
                         close_ctree(tree_root);
                         return ret;
                 }
         } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
                    fs_info->generation !=
                                 btrfs_super_uuid_tree_generation(disk_super)) {
-               pr_info("BTRFS: checking UUID tree\n");
+               btrfs_info(fs_info, "checking UUID tree");
                 ret = btrfs_check_uuid_tree(fs_info);
                 if (ret) {
-                       pr_warn("BTRFS: failed to check the UUID tree %d\n",
-                               ret);
+                       btrfs_warn(fs_info,
+                               "failed to check the UUID tree: %d", ret);
                         close_ctree(tree_root);
                         return ret;
                 }
@@ -3180,6 +3186,10 @@ fail_cleaner:
         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
  
  fail_sysfs:
+       if (cleaner_mutex_locked) {
+               mutex_unlock(&fs_info->cleaner_mutex);
+               cleaner_mutex_locked = false;
+       }
         btrfs_sysfs_remove_mounted(fs_info);
  
  fail_fsdev_sysfs:
@@ -3646,7 +3656,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
                 if (ret) {
                         mutex_unlock(
                                 &root->fs_info->fs_devices->device_list_mutex);
-                       btrfs_std_error(root->fs_info, ret,
+                       btrfs_handle_fs_error(root->fs_info, ret,
                                     "errors while submitting device barriers.");
                         return ret;
                 }
@@ -3686,7 +3696,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
                 /* FUA is masked off if unsupported and can't be the reason */
-               btrfs_std_error(root->fs_info, -EIO,
+               btrfs_handle_fs_error(root->fs_info, -EIO,
                             "%d errors while writing supers", total_errors);
                 return -EIO;
         }
@@ -3704,7 +3714,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
         }
         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
         if (total_errors > max_errors) {
-               btrfs_std_error(root->fs_info, -EIO,
+               btrfs_handle_fs_error(root->fs_info, -EIO,
                             "%d errors while writing supers", total_errors);
                 return -EIO;
         }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 84e060e..9424864 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
         return readonly;
  }
  
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+       bool ret = true;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg)
+               return false;
+
+       spin_lock(&bg->lock);
+       if (bg->ro)
+               ret = false;
+       else
+               atomic_inc(&bg->nocow_writers);
+       spin_unlock(&bg->lock);
+
+       /* on success keep our reference; btrfs_dec_nocow_writers() drops it */
+       if (!ret)
+               btrfs_put_block_group(bg);
+
+       return ret;
+
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->nocow_writers))
+               wake_up_atomic_t(&bg->nocow_writers);
+       /*
+        * Once for our lookup and once for the lookup done by a previous call
+        * to btrfs_inc_nocow_writers()
+        */
+       btrfs_put_block_group(bg);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+       wait_on_atomic_t(&bg->nocow_writers,
+                        btrfs_wait_nocow_writers_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
  static const char *alloc_name(u64 flags)
  {
         switch (flags) {
@@ -4141,7 +4194,7 @@ commit_trans:
  
                         if (need_commit > 0) {
                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
-                               btrfs_wait_ordered_roots(fs_info, -1);
+                               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
                         }
  
                         trans = btrfs_join_transaction(root);
@@ -4583,7 +4636,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                  */
                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
                 if (!current->journal_info)
-                       btrfs_wait_ordered_roots(root->fs_info, nr_items);
+                       btrfs_wait_ordered_roots(root->fs_info, nr_items,
+                                                0, (u64)-1);
         }
  }
  
@@ -4620,7 +4674,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
  
         /* Calc the number of the pages we need flush for space reservation */
         items = calc_reclaim_items_nr(root, to_reclaim);
-       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+       to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
  
         trans = (struct btrfs_trans_handle *)current->journal_info;
         block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4686,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                 if (trans)
                         return;
                 if (wait_ordered)
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                 return;
         }
  
@@ -4671,7 +4726,8 @@ skip_async:
  
                 loops++;
                 if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                 } else {
                         time_left = schedule_timeout_killable(1);
                         if (time_left)
@@ -6172,6 +6228,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
         return 0;
  }
  
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                       const u64 start)
+{
+       struct btrfs_block_group_cache *bg;
+
+       bg = btrfs_lookup_block_group(fs_info, start);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->reservations))
+               wake_up_atomic_t(&bg->reservations);
+       btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+       struct btrfs_space_info *space_info = bg->space_info;
+
+       ASSERT(bg->ro);
+
+       if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+               return;
+
+       /*
+        * Our block group is read only, but before we set it to read only
+        * some task might have already allocated an extent from it without
+        * having yet created the respective ordered extent (and added it to a
+        * root's list of ordered extents).
+        * Therefore wait for any task currently allocating extents, since the
+        * block group's reservations counter is incremented while a read lock
+        * on the groups' semaphore is held and decremented after releasing
+        * the read access on that semaphore and creating the ordered extent.
+        */
+       down_write(&space_info->groups_sem);
+       up_write(&space_info->groups_sem);
+
+       wait_on_atomic_t(&bg->reservations,
+                        btrfs_wait_bg_reservations_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+}
+
  /**
   * btrfs_update_reserved_bytes - update the block_group and space info counters
   * @cache:     The cache we are manipulating
@@ -7025,36 +7132,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
                    int delalloc)
  {
         struct btrfs_block_group_cache *used_bg = NULL;
-       bool locked = false;
-again:
+
         spin_lock(&cluster->refill_lock);
-       if (locked) {
-               if (used_bg == cluster->block_group)
+       while (1) {
+               used_bg = cluster->block_group;
+               if (!used_bg)
+                       return NULL;
+
+               if (used_bg == block_group)
                         return used_bg;
  
-               up_read(&used_bg->data_rwsem);
-               btrfs_put_block_group(used_bg);
-       }
+               btrfs_get_block_group(used_bg);
  
-       used_bg = cluster->block_group;
-       if (!used_bg)
-               return NULL;
+               if (!delalloc)
+                       return used_bg;
  
-       if (used_bg == block_group)
-               return used_bg;
+               if (down_read_trylock(&used_bg->data_rwsem))
+                       return used_bg;
  
-       btrfs_get_block_group(used_bg);
+               spin_unlock(&cluster->refill_lock);
  
-       if (!delalloc)
-               return used_bg;
+               down_read(&used_bg->data_rwsem);
  
-       if (down_read_trylock(&used_bg->data_rwsem))
-               return used_bg;
+               spin_lock(&cluster->refill_lock);
+               if (used_bg == cluster->block_group)
+                       return used_bg;
  
-       spin_unlock(&cluster->refill_lock);
-       down_read(&used_bg->data_rwsem);
-       locked = true;
-       goto again;
+               up_read(&used_bg->data_rwsem);
+               btrfs_put_block_group(used_bg);
+       }
  }
  
  static inline void
@@ -7431,6 +7537,7 @@ checks:
                         btrfs_add_free_space(block_group, offset, num_bytes);
                         goto loop;
                 }
+               btrfs_inc_block_group_reservations(block_group);
  
                 /* we are all good, lets return */
                 ins->objectid = search_start;
@@ -7612,8 +7719,10 @@ again:
         WARN_ON(num_bytes < root->sectorsize);
         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
                                flags, delalloc);
-
-       if (ret == -ENOSPC) {
+       if (!ret && !is_data) {
+               btrfs_dec_block_group_reservations(root->fs_info,
+                                                  ins->objectid);
+       } else if (ret == -ENOSPC) {
                 if (!final_tried && ins->offset) {
                         num_bytes = min(num_bytes >> 1, ins->offset);
                         num_bytes = round_down(num_bytes, root->sectorsize);
@@ -9058,7 +9167,7 @@ out:
         if (!for_reloc && root_dropped == false)
                 btrfs_add_dead_root(root);
         if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err, NULL);
+               btrfs_handle_fs_error(root->fs_info, err, NULL);
         return err;
  }
  
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index d247fc0..2f83448 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3200,14 +3200,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
         return ret;
  }
  
-static noinline void update_nr_written(struct page *page,
-                                     struct writeback_control *wbc,
-                                     unsigned long nr_written)
+static void update_nr_written(struct page *page, struct writeback_control *wbc,
+                             unsigned long nr_written)
  {
         wbc->nr_to_write -= nr_written;
-       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-               page->mapping->writeback_index = page->index + nr_written;
  }
  
  /*
@@ -3368,6 +3364,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
  
         while (cur <= end) {
                 u64 em_end;
+               unsigned long max_nr;
+
                 if (cur >= i_size) {
                         if (tree->ops && tree->ops->writepage_end_io_hook)
                                 tree->ops->writepage_end_io_hook(page, cur,
@@ -3423,32 +3421,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
                         continue;
                 }
  
-               if (tree->ops && tree->ops->writepage_io_hook) {
-                       ret = tree->ops->writepage_io_hook(page, cur,
-                                               cur + iosize - 1);
-               } else {
-                       ret = 0;
+               max_nr = (i_size >> PAGE_SHIFT) + 1;
+
+               set_range_writeback(tree, cur, cur + iosize - 1);
+               if (!PageWriteback(page)) {
+                       btrfs_err(BTRFS_I(inode)->root->fs_info,
+                                  "page %lu not writeback, cur %llu end %llu",
+                              page->index, cur, end);
                 }
-               if (ret) {
-                       SetPageError(page);
-               } else {
-                       unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
  
-                       set_range_writeback(tree, cur, cur + iosize - 1);
-                       if (!PageWriteback(page)) {
-                               btrfs_err(BTRFS_I(inode)->root->fs_info,
-                                          "page %lu not writeback, cur %llu end %llu",
-                                      page->index, cur, end);
-                       }
+               ret = submit_extent_page(write_flags, tree, wbc, page,
+                                        sector, iosize, pg_offset,
+                                        bdev, &epd->bio, max_nr,
+                                        end_bio_extent_writepage,
+                                        0, 0, 0, false);
+               if (ret)
+                       SetPageError(page);
  
-                       ret = submit_extent_page(write_flags, tree, wbc, page,
-                                                sector, iosize, pg_offset,
-                                                bdev, &epd->bio, max_nr,
-                                                end_bio_extent_writepage,
-                                                0, 0, 0, false);
-                       if (ret)
-                               SetPageError(page);
-               }
                 cur = cur + iosize;
                 pg_offset += iosize;
                 nr++;
@@ -3920,12 +3909,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
         struct inode *inode = mapping->host;
         int ret = 0;
         int done = 0;
-       int err = 0;
         int nr_to_write_done = 0;
         struct pagevec pvec;
         int nr_pages;
         pgoff_t index;
         pgoff_t end;            /* Inclusive */
+       pgoff_t done_index;
+       int range_whole = 0;
         int scanned = 0;
         int tag;
  
@@ -3948,6 +3938,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
         } else {
                 index = wbc->range_start >> PAGE_SHIFT;
                 end = wbc->range_end >> PAGE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
                 scanned = 1;
         }
         if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3957,6 +3949,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
  retry:
         if (wbc->sync_mode == WB_SYNC_ALL)
                 tag_pages_for_writeback(mapping, index, end);
+       done_index = index;
         while (!done && !nr_to_write_done && (index <= end) &&
                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                         min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -3966,6 +3959,7 @@ retry:
                 for (i = 0; i < nr_pages; i++) {
                         struct page *page = pvec.pages[i];
  
+                       done_index = page->index;
                         /*
                          * At this point we hold neither mapping->tree_lock nor
                          * lock on the page itself: the page may be truncated or
@@ -4007,8 +4001,20 @@ retry:
                                 unlock_page(page);
                                 ret = 0;
                         }
-                       if (!err && ret < 0)
-                               err = ret;
+                       if (ret < 0) {
+                               /*
+                                * done_index is set past this page,
+                                * so media errors will not choke
+                                * background writeout for the entire
+                                * file. This has consequences for
+                                * range_cyclic semantics (ie. it may
+                                * not be suitable for data integrity
+                                * writeout).
+                                */
+                               done_index = page->index + 1;
+                               done = 1;
+                               break;
+                       }
  
                         /*
                          * the filesystem may choose to bump up nr_to_write.
@@ -4020,7 +4026,7 @@ retry:
                 pagevec_release(&pvec);
                 cond_resched();
         }
-       if (!scanned && !done && !err) {
+       if (!scanned && !done) {
                 /*
                  * We hit the last page and there is more work to be done: wrap
                  * back to the start of the file
@@ -4029,8 +4035,12 @@ retry:
                 index = 0;
                 goto retry;
         }
+
+       if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+               mapping->writeback_index = done_index;
+
         btrfs_add_delayed_iput(inode);
-       return err;
+       return ret;
  }
  
  static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4822,7 +4832,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
                 return NULL;
         eb->fs_info = fs_info;
  again:
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       ret = radix_tree_preload(GFP_NOFS);
         if (ret)
                 goto free_eb;
         spin_lock(&fs_info->buffer_lock);
@@ -4923,7 +4933,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
         if (uptodate)
                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
  again:
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       ret = radix_tree_preload(GFP_NOFS);
         if (ret)
                 goto free_eb;
  
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h

index b5e0ade..981f402 100644 (file)
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -71,7 +71,6 @@ struct extent_io_ops {
                              u64 start, u64 end, int *page_started,
                              unsigned long *nr_written);
         int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
-       int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
         extent_submit_bio_hook_t *submit_bio_hook;
         int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
                               size_t size, struct bio *bio,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index ea9f10b..c98805c 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1696,7 +1696,9 @@ again:
                         btrfs_end_write_no_snapshoting(root);
                         btrfs_delalloc_release_metadata(inode, release_bytes);
                 } else {
-                       btrfs_delalloc_release_space(inode, pos, release_bytes);
+                       btrfs_delalloc_release_space(inode,
+                                               round_down(pos, root->sectorsize),
+                                               release_bytes);
                 }
         }
  
@@ -2952,7 +2954,7 @@ const struct file_operations btrfs_file_operations = {
         .fallocate      = btrfs_fallocate,
         .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
  #endif
         .copy_file_range = btrfs_copy_file_range,
         .clone_file_range = btrfs_clone_file_range,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c

index be4d22a..b8acc07 100644 (file)
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
          */
         if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
                                             name, name_len, &extref)) {
-               btrfs_std_error(root->fs_info, -ENOENT, NULL);
+               btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
                 ret = -EROFS;
                 goto out;
         }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 6b7fe29..91419ef 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,6 +824,7 @@ retry:
                                                 async_extent->ram_size - 1, 0);
                         goto out_free_reserve;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 /*
                  * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
         }
         return;
  out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_free:
         extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
                                 goto out_drop_extent_cache;
                 }
  
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
                 if (disk_num_bytes < cur_alloc_size)
                         break;
  
@@ -1066,6 +1070,7 @@ out:
  out_drop_extent_cache:
         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
  out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_unlock:
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
                          */
                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                 goto out_check;
+                       if (!btrfs_inc_nocow_writers(root->fs_info,
+                                                    disk_bytenr))
+                               goto out_check;
                         nocow = 1;
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                         extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
                         path->slots[0]++;
                         if (!nolock && nocow)
                                 btrfs_end_write_no_snapshoting(root);
+                       if (nocow)
+                               btrfs_dec_nocow_writers(root->fs_info,
+                                                       disk_bytenr);
                         goto next_slot;
                 }
                 if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
                         if (ret) {
                                 if (!nolock && nocow)
                                         btrfs_end_write_no_snapshoting(root);
+                               if (nocow)
+                                       btrfs_dec_nocow_writers(root->fs_info,
+                                                               disk_bytenr);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
  
                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                                num_bytes, num_bytes, type);
+               if (nocow)
+                       btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
                 BUG_ON(ret); /* -ENOMEM */
  
                 if (root->root_key.objectid ==
@@ -7129,6 +7145,43 @@ out:
         return em;
  }
  
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+{
+       struct extent_map *em = NULL;
+       int ret;
+
+       down_read(&BTRFS_I(inode)->dio_sem);
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = create_pinned_em(inode, start, len, orig_start,
+                                     block_start, block_len, orig_block_len,
+                                     ram_bytes, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (ret) {
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_cache(inode, start,
+                                               start + len - 1, 0);
+               }
+               em = ERR_PTR(ret);
+       }
+ out:
+       up_read(&BTRFS_I(inode)->dio_sem);
+
+       return em;
+}
+
  static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                   u64 start, u64 len)
  {
@@ -7144,41 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
         if (ret)
                 return ERR_PTR(ret);
  
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+       if (IS_ERR(em))
                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
-
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
  
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
         return em;
  }
  
@@ -7650,24 +7675,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 block_start = em->block_start + (start - em->start);
  
                 if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1) {
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
+
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(root->fs_info, block_start);
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       ret = PTR_ERR(em);
-                                       goto unlock_err;
-                               }
+                               em = em2;
                         }
-
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
                                 goto unlock_err;
                         }
                         goto unlock;
@@ -9230,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->delayed_iput);
         RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
  
         return inode;
  }
@@ -9387,10 +9410,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
         return 0;
  }
  
+static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+
+       /* we only allow rename subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       ret = btrfs_end_transaction(trans, root);
+out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+
+       return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+{
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+
+       ret = btrfs_update_inode(trans, root, inode);
+out:
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+
+       return ret;
+}
+
  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
  {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9693,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         u64 root_objectid;
         int ret;
         u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
  
         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                 return -EPERM;
@@ -9449,15 +9744,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
          * We want to reserve the absolute worst case amount of items.  So if
          * both inodes are subvols and we need to unlink them then that would
          * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
  
         if (dest != root)
                 btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9772,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9478,14 +9781,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                              btrfs_ino(new_dir), index);
                 if (ret)
                         goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
         }
  
         inode_inc_iversion(old_dir);
@@ -9552,12 +9847,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (old_inode->i_nlink == 1)
                 BTRFS_I(old_inode)->dir_index = index;
  
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                 struct dentry *parent = new_dentry->d_parent;
+
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
         }
  out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
         btrfs_end_transaction(trans, root);
  out_notrans:
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9899,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
  {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
  
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
  }
  
  static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10275,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                 btrfs_end_transaction(trans, root);
                         break;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 last_alloc = ins.offset;
                 ret = insert_reserved_file_extent(trans, inode,
@@ -10184,7 +10518,7 @@ static const struct file_operations btrfs_dir_file_operations = {
         .iterate        = btrfs_real_readdir,
         .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
  #endif
         .release        = btrfs_release_file,
         .fsync          = btrfs_sync_file,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 0b8ba71..4e70069 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
         if (flags & BTRFS_INODE_NODATACOW)
                 iflags |= FS_NOCOW_FL;
  
-       if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
-               iflags |= FS_COMPR_FL;
-       else if (flags & BTRFS_INODE_NOCOMPRESS)
+       if (flags & BTRFS_INODE_NOCOMPRESS)
                 iflags |= FS_NOCOMP_FL;
+       else if (flags & BTRFS_INODE_COMPRESS)
+               iflags |= FS_COMPR_FL;
  
         return iflags;
  }
@@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir,
  {
         struct btrfs_trans_handle *trans;
         struct btrfs_key key;
-       struct btrfs_root_item root_item;
+       struct btrfs_root_item *root_item;
         struct btrfs_inode_item *inode_item;
         struct extent_buffer *leaf;
         struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir,
         u64 qgroup_reserved;
         uuid_le new_uuid;
  
+       root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+       if (!root_item)
+               return -ENOMEM;
+
         ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
         if (ret)
-               return ret;
+               goto fail_free;
  
         /*
          * Don't create subvolume whose level is not zero. Or qgroup will be
          * screwed up since it assume subvolme qgroup's level to be 0.
          */
-       if (btrfs_qgroup_level(objectid))
-               return -ENOSPC;
+       if (btrfs_qgroup_level(objectid)) {
+               ret = -ENOSPC;
+               goto fail_free;
+       }
  
         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
         /*
@@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir,
         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
                                                8, &qgroup_reserved, false);
         if (ret)
-               return ret;
+               goto fail_free;
  
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 btrfs_subvolume_release_metadata(root, &block_rsv,
                                                  qgroup_reserved);
-               return ret;
+               goto fail_free;
         }
         trans->block_rsv = &block_rsv;
         trans->bytes_reserved = block_rsv.size;
@@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir,
                             BTRFS_UUID_SIZE);
         btrfs_mark_buffer_dirty(leaf);
  
-       memset(&root_item, 0, sizeof(root_item));
-
-       inode_item = &root_item.inode;
+       inode_item = &root_item->inode;
         btrfs_set_stack_inode_generation(inode_item, 1);
         btrfs_set_stack_inode_size(inode_item, 3);
         btrfs_set_stack_inode_nlink(inode_item, 1);
         btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
  
-       btrfs_set_root_flags(&root_item, 0);
-       btrfs_set_root_limit(&root_item, 0);
+       btrfs_set_root_flags(root_item, 0);
+       btrfs_set_root_limit(root_item, 0);
         btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
  
-       btrfs_set_root_bytenr(&root_item, leaf->start);
-       btrfs_set_root_generation(&root_item, trans->transid);
-       btrfs_set_root_level(&root_item, 0);
-       btrfs_set_root_refs(&root_item, 1);
-       btrfs_set_root_used(&root_item, leaf->len);
-       btrfs_set_root_last_snapshot(&root_item, 0);
+       btrfs_set_root_bytenr(root_item, leaf->start);
+       btrfs_set_root_generation(root_item, trans->transid);
+       btrfs_set_root_level(root_item, 0);
+       btrfs_set_root_refs(root_item, 1);
+       btrfs_set_root_used(root_item, leaf->len);
+       btrfs_set_root_last_snapshot(root_item, 0);
  
-       btrfs_set_root_generation_v2(&root_item,
-                       btrfs_root_generation(&root_item));
+       btrfs_set_root_generation_v2(root_item,
+                       btrfs_root_generation(root_item));
         uuid_le_gen(&new_uuid);
-       memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
-       btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
-       btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
-       root_item.ctime = root_item.otime;
-       btrfs_set_root_ctransid(&root_item, trans->transid);
-       btrfs_set_root_otransid(&root_item, trans->transid);
+       memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+       btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+       btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+       root_item->ctime = root_item->otime;
+       btrfs_set_root_ctransid(root_item, trans->transid);
+       btrfs_set_root_otransid(root_item, trans->transid);
  
         btrfs_tree_unlock(leaf);
         free_extent_buffer(leaf);
         leaf = NULL;
  
-       btrfs_set_root_dirid(&root_item, new_dirid);
+       btrfs_set_root_dirid(root_item, new_dirid);
  
         key.objectid = objectid;
         key.offset = 0;
         key.type = BTRFS_ROOT_ITEM_KEY;
         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-                               &root_item);
+                               root_item);
         if (ret)
                 goto fail;
  
@@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir,
         BUG_ON(ret);
  
         ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
-                                 root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+                                 root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
                                   objectid);
         if (ret)
                 btrfs_abort_transaction(trans, root, ret);
  
  fail:
+       kfree(root_item);
         trans->block_rsv = NULL;
         trans->bytes_reserved = 0;
         btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@ -629,6 +634,10 @@ fail:
                 d_instantiate(dentry, inode);
         }
         return ret;
+
+fail_free:
+       kfree(root_item);
+       return ret;
  }
  
  static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
         if (ret)
                 goto dec_and_free;
  
-       btrfs_wait_ordered_extents(root, -1);
+       btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
  
         btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                              BTRFS_BLOCK_RSV_TEMP);
@@ -2671,10 +2680,10 @@ out:
         return ret;
  }
  
-static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
  {
         struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
-       struct btrfs_ioctl_vol_args *vol_args;
+       struct btrfs_ioctl_vol_args_v2 *vol_args;
         int ret;
  
         if (!capable(CAP_SYS_ADMIN))
@@ -2690,7 +2699,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
                 goto err_drop;
         }
  
-       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+       ret = -EOPNOTSUPP; /* unknown flags: leave via out, freeing vol_args */
+       if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+               goto out;
  
         if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                         1)) {
@@ -2699,13 +2710,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
         }
  
         mutex_lock(&root->fs_info->volume_mutex);
-       ret = btrfs_rm_device(root, vol_args->name);
+       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+               ret = btrfs_rm_device(root, NULL, vol_args->devid);
+       } else {
+               vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+               ret = btrfs_rm_device(root, vol_args->name, 0);
+       }
         mutex_unlock(&root->fs_info->volume_mutex);
         atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
  
-       if (!ret)
-               btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
-
+       if (!ret) {
+               if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+                       btrfs_info(root->fs_info, "device deleted: id %llu",
+                                       vol_args->devid);
+               else
+                       btrfs_info(root->fs_info, "device deleted: %s",
+                                       vol_args->name);
+       }
  out:
         kfree(vol_args);
  err_drop:
@@ -2713,6 +2734,47 @@ err_drop:
         return ret;
  }
  
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+       struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+       struct btrfs_ioctl_vol_args *vol_args;
+       int ret;
+       /* Remove the device named by vol_args->name; CAP_SYS_ADMIN only. */
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+       /* Claim the exclusive-operation flag; fail if it was already set. */
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+               goto out_drop_write;
+       }
+
+       vol_args = memdup_user(arg, sizeof(*vol_args));
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
+       /* Force NUL termination of the user-supplied name before use. */
+       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+       mutex_lock(&root->fs_info->volume_mutex);
+       ret = btrfs_rm_device(root, vol_args->name, 0);
+       mutex_unlock(&root->fs_info->volume_mutex);
+
+       if (!ret)
+               btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+       kfree(vol_args);
+out:   /* reached with vol_args either never allocated or already freed */
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+out_drop_write:        /* pairs with the mnt_want_write_file() call above */
+       mnt_drop_write_file(file);
+
+       return ret;
+}
+
  static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
  {
         struct btrfs_ioctl_fs_info_args *fi_args;
@@ -3472,13 +3534,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
         u64 last_dest_end = destoff;
  
         ret = -ENOMEM;
-       buf = vmalloc(root->nodesize);
-       if (!buf)
-               return ret;
+       buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+       if (!buf) {
+               buf = vmalloc(root->nodesize);
+               if (!buf)
+                       return ret;
+       }
  
         path = btrfs_alloc_path();
         if (!path) {
-               vfree(buf);
+               kvfree(buf);
                 return ret;
         }
  
@@ -3779,7 +3844,7 @@ process_slot:
  
  out:
         btrfs_free_path(path);
-       vfree(buf);
+       kvfree(buf);
         return ret;
  }
  
@@ -4380,7 +4445,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
                         1)) {
                         ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                 } else {
-                       ret = btrfs_dev_replace_start(root, p);
+                       ret = btrfs_dev_replace_by_ioctl(root, p);
                         atomic_set(
                          &root->fs_info->mutually_exclusive_operation_running,
                          0);
@@ -4851,8 +4916,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
         /* update qgroup status and info */
         err = btrfs_run_qgroups(trans, root->fs_info);
         if (err < 0)
-               btrfs_std_error(root->fs_info, ret,
-                           "failed to update qgroup status and info\n");
+               btrfs_handle_fs_error(root->fs_info, err,
+                           "failed to update qgroup status and info");
         err = btrfs_end_transaction(trans, root);
         if (err && !ret)
                 ret = err;
@@ -5398,9 +5463,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
         if (ret)
                 return ret;
  
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
         trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_drop_write;
+       }
  
         spin_lock(&root->fs_info->super_lock);
         newflags = btrfs_super_compat_flags(super_block);
@@ -5419,7 +5490,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
         btrfs_set_super_incompat_flags(super_block, newflags);
         spin_unlock(&root->fs_info->super_lock);
  
-       return btrfs_commit_transaction(trans, root);
+       ret = btrfs_commit_transaction(trans, root);
+out_drop_write:
+       mnt_drop_write_file(file);
+
+       return ret;
  }
  
  long btrfs_ioctl(struct file *file, unsigned int
@@ -5463,6 +5538,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                 return btrfs_ioctl_add_dev(root, argp);
         case BTRFS_IOC_RM_DEV:
                 return btrfs_ioctl_rm_dev(file, argp);
+       case BTRFS_IOC_RM_DEV_V2:
+               return btrfs_ioctl_rm_dev_v2(file, argp);
         case BTRFS_IOC_FS_INFO:
                 return btrfs_ioctl_fs_info(root, argp);
         case BTRFS_IOC_DEV_INFO:
@@ -5556,3 +5633,24 @@ long btrfs_ioctl(struct file *file, unsigned int
  
         return -ENOTTY;
  }
+
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {  /* map 32-bit command aliases to the native numbers */
+       case FS_IOC32_GETFLAGS:
+               cmd = FS_IOC_GETFLAGS;
+               break;
+       case FS_IOC32_SETFLAGS:
+               cmd = FS_IOC_SETFLAGS;
+               break;
+       case FS_IOC32_GETVERSION:
+               cmd = FS_IOC_GETVERSION;
+               break;
+       default:        /* no 32-bit alias: pass cmd to btrfs_ioctl() as is */
+               break;
+       }
+
+       return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index 0de7da5..5591704 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
   * wait for all the ordered extents in a root.  This is done when balancing
   * space between drives.
   */
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+                              const u64 range_start, const u64 range_len)
  {
-       struct list_head splice, works;
+       LIST_HEAD(splice);
+       LIST_HEAD(skipped);
+       LIST_HEAD(works);
         struct btrfs_ordered_extent *ordered, *next;
         int count = 0;
-
-       INIT_LIST_HEAD(&splice);
-       INIT_LIST_HEAD(&works);
+       const u64 range_end = range_start + range_len;
  
         mutex_lock(&root->ordered_extent_mutex);
         spin_lock(&root->ordered_extent_lock);
@@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
         while (!list_empty(&splice) && nr) {
                 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
                                            root_extent_list);
+
+               if (range_end <= ordered->start ||
+                   ordered->start + ordered->disk_len <= range_start) {
+                       list_move_tail(&ordered->root_extent_list, &skipped);
+                       cond_resched_lock(&root->ordered_extent_lock);
+                       continue;
+               }
+
                 list_move_tail(&ordered->root_extent_list,
                                &root->ordered_extents);
                 atomic_inc(&ordered->refs);
@@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
                         nr--;
                 count++;
         }
+       list_splice_tail(&skipped, &root->ordered_extents);
         list_splice_tail(&splice, &root->ordered_extents);
         spin_unlock(&root->ordered_extent_lock);
  
@@ -708,7 +718,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
         return count;
  }
  
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+                             const u64 range_start, const u64 range_len)
  {
         struct btrfs_root *root;
         struct list_head splice;
@@ -728,7 +739,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
                                &fs_info->ordered_roots);
                 spin_unlock(&fs_info->ordered_root_lock);
  
-               done = btrfs_wait_ordered_extents(root, nr);
+               done = btrfs_wait_ordered_extents(root, nr,
+                                                 range_start, range_len);
                 btrfs_put_fs_root(root);
  
                 spin_lock(&fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h

index 23c9605..8ef1262 100644 (file)
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                 struct btrfs_ordered_extent *ordered);
  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                            u32 *sum, int len);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+                              const u64 range_start, const u64 range_len);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+                             const u64 range_start, const u64 range_len);
  void btrfs_get_logged_extents(struct inode *inode,
                               struct list_head *logged_list,
                               const loff_t start,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c

index 08ef890..1cfd35c 100644 (file)
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2418,7 +2418,7 @@ again:
         }
  out:
         if (ret) {
-               btrfs_std_error(root->fs_info, ret, NULL);
+               btrfs_handle_fs_error(root->fs_info, ret, NULL);
                 if (!list_empty(&reloc_roots))
                         free_reloc_roots(&reloc_roots);
  
@@ -4254,12 +4254,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
         btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
                rc->block_group->key.objectid, rc->block_group->flags);
  
-       ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       btrfs_wait_ordered_roots(fs_info, -1);
+       btrfs_wait_block_group_reservations(rc->block_group);
+       btrfs_wait_nocow_writers(rc->block_group);
+       btrfs_wait_ordered_roots(fs_info, -1,
+                                rc->block_group->key.objectid,
+                                rc->block_group->key.offset);
  
         while (1) {
                 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c

index 9fcd6df..b2b14e7 100644 (file)
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                         trans = btrfs_join_transaction(tree_root);
                         if (IS_ERR(trans)) {
                                 err = PTR_ERR(trans);
-                               btrfs_std_error(tree_root->fs_info, err,
+                               btrfs_handle_fs_error(tree_root->fs_info, err,
                                             "Failed to start trans to delete "
                                             "orphan item");
                                 break;
@@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                                                     root_key.objectid);
                         btrfs_end_transaction(trans, tree_root);
                         if (err) {
-                               btrfs_std_error(tree_root->fs_info, err,
+                               btrfs_handle_fs_error(tree_root->fs_info, err,
                                             "Failed to delete root orphan "
                                             "item");
                                 break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c

index 4678f03..fa35cdc 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                 recover->bbio = bbio;
                 recover->map_length = mapped_length;
  
-               BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+               BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
  
                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
  
@@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
         if (bio->bi_error)
                 sblock->no_io_error_seen = 0;
  
+       bio_put(bio);
+
         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
  }
  
@@ -2860,7 +2862,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
         int extent_mirror_num;
         int stop_loop = 0;
  
-       nsectors = map->stripe_len / root->sectorsize;
+       nsectors = div_u64(map->stripe_len, root->sectorsize);
         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
                           GFP_NOFS);
@@ -3070,7 +3072,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         int slot;
         u64 nstripes;
         struct extent_buffer *l;
-       struct btrfs_key key;
         u64 physical;
         u64 logical;
         u64 logic_end;
@@ -3079,7 +3080,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         int mirror_num;
         struct reada_control *reada1;
         struct reada_control *reada2;
-       struct btrfs_key key_start;
+       struct btrfs_key key;
         struct btrfs_key key_end;
         u64 increment = map->stripe_len;
         u64 offset;
@@ -3158,21 +3159,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         scrub_blocked_if_needed(fs_info);
  
         /* FIXME it might be better to start readahead at commit root */
-       key_start.objectid = logical;
-       key_start.type = BTRFS_EXTENT_ITEM_KEY;
-       key_start.offset = (u64)0;
+       key.objectid = logical;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = (u64)0;
         key_end.objectid = logic_end;
         key_end.type = BTRFS_METADATA_ITEM_KEY;
         key_end.offset = (u64)-1;
-       reada1 = btrfs_reada_add(root, &key_start, &key_end);
+       reada1 = btrfs_reada_add(root, &key, &key_end);
  
-       key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-       key_start.type = BTRFS_EXTENT_CSUM_KEY;
-       key_start.offset = logical;
+       key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key.type = BTRFS_EXTENT_CSUM_KEY;
+       key.offset = logical;
         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
         key_end.type = BTRFS_EXTENT_CSUM_KEY;
         key_end.offset = logic_end;
-       reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+       reada2 = btrfs_reada_add(csum_root, &key, &key_end);
  
         if (!IS_ERR(reada1))
                 btrfs_reada_wait(reada1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c

index 8d358c5..6a8c860 100644 (file)
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
         u32 i;
         u64 *clone_sources_tmp = NULL;
         int clone_sources_to_rollback = 0;
+       size_t alloc_size;      /* byte count; unsigned could truncate it */
         int sort_clone_roots = 0;
         int index;
  
@@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                 goto out;
         }
  
+       if (arg->clone_sources_count >
+           ULLONG_MAX / sizeof(*arg->clone_sources)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
         if (!access_ok(VERIFY_READ, arg->clone_sources,
                         sizeof(*arg->clone_sources) *
                         arg->clone_sources_count)) {
@@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
         sctx->clone_roots_cnt = arg->clone_sources_count;
  
         sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
-       sctx->send_buf = vmalloc(sctx->send_max_size);
+       sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
         if (!sctx->send_buf) {
-               ret = -ENOMEM;
-               goto out;
+               sctx->send_buf = vmalloc(sctx->send_max_size);
+               if (!sctx->send_buf) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
         }
  
-       sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+       sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
         if (!sctx->read_buf) {
-               ret = -ENOMEM;
-               goto out;
+               sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+               if (!sctx->read_buf) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
         }
  
         sctx->pending_dir_moves = RB_ROOT;
         sctx->waiting_dir_moves = RB_ROOT;
         sctx->orphan_dirs = RB_ROOT;
  
-       sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
-                       (arg->clone_sources_count + 1));
+       alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
+
+       sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
         if (!sctx->clone_roots) {
-               ret = -ENOMEM;
-               goto out;
+               sctx->clone_roots = vzalloc(alloc_size);
+               if (!sctx->clone_roots) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
         }
  
+       alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+
         if (arg->clone_sources_count) {
-               clone_sources_tmp = vmalloc(arg->clone_sources_count *
-                               sizeof(*arg->clone_sources));
+               clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
                 if (!clone_sources_tmp) {
-                       ret = -ENOMEM;
-                       goto out;
+                       clone_sources_tmp = vmalloc(alloc_size);
+                       if (!clone_sources_tmp) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
                 }
  
                 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
-                               arg->clone_sources_count *
-                               sizeof(*arg->clone_sources));
+                               alloc_size);
                 if (ret) {
                         ret = -EFAULT;
                         goto out;
@@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                         sctx->clone_roots[i].root = clone_root;
                         clone_sources_to_rollback = i + 1;
                 }
-               vfree(clone_sources_tmp);
+               kvfree(clone_sources_tmp);
                 clone_sources_tmp = NULL;
         }
  
@@ -6207,15 +6227,15 @@ out:
                 btrfs_root_dec_send_in_progress(sctx->parent_root);
  
         kfree(arg);
-       vfree(clone_sources_tmp);
+       kvfree(clone_sources_tmp);
  
         if (sctx) {
                 if (sctx->send_filp)
                         fput(sctx->send_filp);
  
-               vfree(sctx->clone_roots);
-               vfree(sctx->send_buf);
-               vfree(sctx->read_buf);
+               kvfree(sctx->clone_roots);
+               kvfree(sctx->send_buf);
+               kvfree(sctx->read_buf);
  
                 name_cache_free(sctx);
  
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index 00b8f37..bf71071 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno)
         return errstr;
  }
  
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
-       /*
-        * today we only save the error info into ram.  Long term we'll
-        * also send it down to the disk
-        */
-       set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
-}
-
  /* btrfs handle error by forcing the filesystem readonly */
  static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
  {
@@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
  }
  
  /*
- * __btrfs_std_error decodes expected errors from the caller and
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
   * invokes the approciate error response.
   */
  __cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
                        unsigned int line, int errno, const char *fmt, ...)
  {
         struct super_block *sb = fs_info->sb;
@@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
         }
  #endif
  
+       /*
+        * Today we only save the error info to memory.  Long term we'll
+        * also send it down to the disk
+        */
+       set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
         /* Don't go through full error handling during mount */
-       save_error_info(fs_info);
         if (sb->s_flags & MS_BORN)
                 btrfs_handle_error(fs_info);
  }
@@ -252,7 +248,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
         /* Wake up anybody who may be waiting on this transaction */
         wake_up(&root->fs_info->transaction_wait);
         wake_up(&root->fs_info->transaction_blocked_wait);
-       __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+       __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
  }
  /*
   * __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -1160,7 +1156,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                 return 0;
         }
  
-       btrfs_wait_ordered_roots(fs_info, -1);
+       btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
  
         trans = btrfs_attach_transaction_barrier(root);
         if (IS_ERR(trans)) {
@@ -1488,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
                 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
         } else {
                 /*
-                * Since SELinux(the only one supports security_mnt_opts) does
-                * NOT support changing context during remount/mount same sb,
-                * This must be the same or part of the same security options,
-                * just free it.
+                * Since SELinux (the only one supporting security_mnt_opts)
+                * does NOT support changing context during remount/mount of
+                * the same sb, this must be the same or part of the same
+                * security options, just free it.
                  */
                 security_free_mnt_opts(sec_opts);
         }
@@ -1669,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
                                          unsigned long old_opts)
  {
         /*
-        * We need cleanup all defragable inodes if the autodefragment is
-        * close or the fs is R/O.
+        * We need to clean up all defragable inodes if autodefragment was
+        * turned off or the filesystem is read only.
          */
         if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
             (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -2051,9 +2047,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
         int ret;
         u64 thresh = 0;
+       int mixed = 0;
  
         /*
-        * holding chunk_muext to avoid allocating new chunks, holding
+        * holding chunk_mutex to avoid allocating new chunks, holding
          * device_list_mutex to avoid the device being removed
          */
         rcu_read_lock();
@@ -2076,8 +2073,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
                                 }
                         }
                 }
-               if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-                       total_free_meta += found->disk_total - found->disk_used;
+
+               /*
+                * Metadata in mixed block group profiles are accounted in data
+                */
+               if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
+                       if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+                               mixed = 1;
+                       else
+                               total_free_meta += found->disk_total -
+                                       found->disk_used;
+               }
  
                 total_used += found->disk_used;
         }
@@ -2090,7 +2096,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  
         /* Account global block reserve as used, it's in logical size already */
         spin_lock(&block_rsv->lock);
-       buf->f_bfree -= block_rsv->size >> bits;
+       /* Mixed block groups accounting is not byte-accurate, avoid overflow */
+       if (buf->f_bfree >= block_rsv->size >> bits)
+               buf->f_bfree -= block_rsv->size >> bits;
+       else
+               buf->f_bfree = 0;
         spin_unlock(&block_rsv->lock);
  
         buf->f_bavail = div_u64(total_free_data, factor);
@@ -2115,7 +2125,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
          */
         thresh = 4 * 1024 * 1024;
  
-       if (total_free_meta - thresh < block_rsv->size)
+       if (!mixed && total_free_meta - thresh < block_rsv->size)
                 buf->f_bavail = 0;
  
         buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c

index 539e7b5..4879656 100644 (file)
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
         if (!fs_info)
                 return -EPERM;
  
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
         ret = kstrtoul(skip_spaces(buf), 0, &val);
         if (ret)
                 return ret;
@@ -364,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
  {
         struct btrfs_fs_info *fs_info = to_fs_info(kobj);
         char *label = fs_info->super_copy->label;
-       return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+       ssize_t ret;
+
+       spin_lock(&fs_info->super_lock);
+       ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+       spin_unlock(&fs_info->super_lock);
+
+       return ret;
  }
  
  static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,6 +383,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
         struct btrfs_fs_info *fs_info = to_fs_info(kobj);
         size_t p_len;
  
+       if (!fs_info)
+               return -EPERM;
+
         if (fs_info->sb->s_flags & MS_RDONLY)
                 return -EROFS;
  
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 43885e5..5b0b758 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -311,10 +311,11 @@ loop:
   * when the transaction commits
   */
  static int record_root_in_trans(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root)
+                              struct btrfs_root *root,
+                              int force)
  {
-       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
-           root->last_trans < trans->transid) {
+       if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+           root->last_trans < trans->transid) || force) {
                 WARN_ON(root == root->fs_info->extent_root);
                 WARN_ON(root->commit_root != root->node);
  
@@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
                 smp_wmb();
  
                 spin_lock(&root->fs_info->fs_roots_radix_lock);
-               if (root->last_trans == trans->transid) {
+               if (root->last_trans == trans->transid && !force) {
                         spin_unlock(&root->fs_info->fs_roots_radix_lock);
                         return 0;
                 }
@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                 return 0;
  
         mutex_lock(&root->fs_info->reloc_mutex);
-       record_root_in_trans(trans, root);
+       record_root_in_trans(trans, root, 0);
         mutex_unlock(&root->fs_info->reloc_mutex);
  
         return 0;
@@ -1310,6 +1311,97 @@ int btrfs_defrag_root(struct btrfs_root *root)
         return ret;
  }
  
+/* Bisectability fixup, remove in 4.8 */
+#ifndef btrfs_std_error
+#define btrfs_std_error btrfs_handle_fs_error
+#endif
+
+/*
+ * Do all the special snapshot-related qgroup dirty hacks.
+ *
+ * This does all the needed qgroup inheritance and dirty hacks, like switching
+ * commit roots inside one transaction and writing all btree blocks to disk,
+ * to make qgroups work.
+ */
+static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *src,
+                                  struct btrfs_root *parent,
+                                  struct btrfs_qgroup_inherit *inherit,
+                                  u64 dst_objectid)
+{
+       struct btrfs_fs_info *fs_info = src->fs_info;
+       int ret;
+
+       /*
+        * Save some performance in the case that qgroups are not
+        * enabled. If this check races with the ioctl, rescan will
+        * kick in anyway.
+        */
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (!fs_info->quota_enabled) {
+               mutex_unlock(&fs_info->qgroup_ioctl_lock);
+               return 0;
+       }
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+       /*
+        * We are going to commit the transaction, see the
+        * btrfs_commit_transaction() comment for why tree_log_mutex is locked
+        */
+       mutex_lock(&fs_info->tree_log_mutex);
+
+       ret = commit_fs_roots(trans, src);
+       if (ret)
+               goto out;
+       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+       if (ret < 0)
+               goto out;
+       ret = btrfs_qgroup_account_extents(trans, fs_info);
+       if (ret < 0)
+               goto out;
+
+       /* Now qgroup are all updated, we can inherit it to new qgroups */
+       ret = btrfs_qgroup_inherit(trans, fs_info,
+                                  src->root_key.objectid, dst_objectid,
+                                  inherit);
+       if (ret < 0)
+               goto out;
+
+       /*
+        * Now we do a simplified commit transaction, which will:
+        * 1) commit all subvolume and extent trees
+        *    To ensure all subvolume and extent trees have a valid
+        *    commit_root to account for the later insert_dir_item()
+        * 2) write all btree blocks onto disk
+        *    This is to make sure later btree modifications will be cowed,
+        *    or commit_root can be populated and cause wrong qgroup numbers
+        * In this simplified commit, we don't really care about other trees
+        * like chunk and root tree, as they won't affect qgroup.
+        * And we don't write super to avoid half committed status.
+        */
+       ret = commit_cowonly_roots(trans, src);
+       if (ret)
+               goto out;
+       switch_commit_roots(trans->transaction, fs_info);
+       ret = btrfs_write_and_wait_transaction(trans, src);
+       if (ret)
+               btrfs_std_error(fs_info, ret,
+                       "Error while writing out transaction for qgroup");
+
+out:
+       mutex_unlock(&fs_info->tree_log_mutex);
+
+       /*
+        * Force parent root to be updated, as we recorded it before so its
+        * last_trans == cur_transid.
+        * Otherwise it won't be committed to disk again after the later
+        * insert_dir_item()
+        */
+       if (!ret)
+               record_root_in_trans(trans, parent, 1);
+       return ret;
+}
+
  /*
   * new snapshots need to be created at a very specific time in the
   * transaction commit.  This does the actual creation.
@@ -1383,7 +1475,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         dentry = pending->dentry;
         parent_inode = pending->dir;
         parent_root = BTRFS_I(parent_inode)->root;
-       record_root_in_trans(trans, parent_root);
+       record_root_in_trans(trans, parent_root, 0);
  
         cur_time = current_fs_time(parent_inode->i_sb);
  
@@ -1420,7 +1512,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                 goto fail;
         }
  
-       record_root_in_trans(trans, root);
+       record_root_in_trans(trans, root, 0);
         btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
         btrfs_check_and_init_root_item(new_root_item);
@@ -1516,6 +1608,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                 goto fail;
         }
  
+       /*
+        * Do special qgroup accounting for snapshot, as we do some qgroup
+        * snapshot hack to do fast snapshot.
+        * To co-operate with that hack, we do hack again.
+        * Or snapshot will be greatly slowed down by a subtree qgroup rescan
+        */
+       ret = qgroup_account_snapshot(trans, root, parent_root,
+                                     pending->inherit, objectid);
+       if (ret < 0)
+               goto fail;
+
         ret = btrfs_insert_dir_item(trans, parent_root,
                                     dentry->d_name.name, dentry->d_name.len,
                                     parent_inode, &key,
@@ -1559,23 +1662,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                 goto fail;
         }
  
-       /*
-        * account qgroup counters before qgroup_inherit()
-        */
-       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
-       if (ret)
-               goto fail;
-       ret = btrfs_qgroup_account_extents(trans, fs_info);
-       if (ret)
-               goto fail;
-       ret = btrfs_qgroup_inherit(trans, fs_info,
-                                  root->root_key.objectid,
-                                  objectid, pending->inherit);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto fail;
-       }
-
  fail:
         pending->error = ret;
  dir_item_existed:
@@ -1821,7 +1907,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
  static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
  {
         if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-               btrfs_wait_ordered_roots(fs_info, -1);
+               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
  }
  
  static inline void
@@ -2145,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
         ret = btrfs_write_and_wait_transaction(trans, root);
         if (ret) {
-               btrfs_std_error(root->fs_info, ret,
+               btrfs_handle_fs_error(root->fs_info, ret,
                             "Error while writing out transaction");
                 mutex_unlock(&root->fs_info->tree_log_mutex);
                 goto scrub_continue;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index e692eea..8aaca5c 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
  
         INIT_LIST_HEAD(&extents);
  
+       down_write(&BTRFS_I(inode)->dio_sem);
         write_lock(&tree->lock);
         test_gen = root->fs_info->last_trans_committed;
  
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
         }
  
         list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
         /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
          */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
  process:
         while (!list_empty(&extents)) {
                 em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
         }
         WARN_ON(!list_empty(&extents));
         write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
  
         btrfs_release_path(path);
         return ret;
@@ -4622,23 +4631,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  
         mutex_lock(&BTRFS_I(inode)->log_mutex);
  
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
-
         /*
          * a brute force approach to making sure we get the most uptodate
          * copies of everything.
@@ -4846,21 +4838,6 @@ log_extents:
                         goto out_unlock;
         }
         if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                 &logged_list, ctx, start, end);
                 if (ret) {
@@ -5158,7 +5135,7 @@ process_leaf:
                         }
  
                         ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
+                       if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                 log_mode = LOG_INODE_ALL;
                         btrfs_release_path(path);
                         ret = btrfs_log_inode(trans, root, di_inode,
@@ -5278,11 +5255,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                         if (IS_ERR(dir_inode))
                                 continue;
  
+                       if (ctx)
+                               ctx->log_new_dentries = false;
                         ret = btrfs_log_inode(trans, root, dir_inode,
                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
                         if (!ret &&
                             btrfs_must_commit_transaction(trans, dir_inode))
                                 ret = 1;
+                       if (!ret && ctx && ctx->log_new_dentries)
+                               ret = log_new_dir_dentries(trans, root,
+                                                          dir_inode, ctx);
                         iput(dir_inode);
                         if (ret)
                                 goto out;
@@ -5519,7 +5501,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
  
         ret = walk_log_tree(trans, log_root_tree, &wc);
         if (ret) {
-               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
                             "recovering log root tree.");
                 goto error;
         }
@@ -5533,7 +5515,7 @@ again:
                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
  
                 if (ret < 0) {
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't find tree log root.");
                         goto error;
                 }
@@ -5551,7 +5533,7 @@ again:
                 log = btrfs_read_fs_root(log_root_tree, &found_key);
                 if (IS_ERR(log)) {
                         ret = PTR_ERR(log);
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't read tree log root.");
                         goto error;
                 }
@@ -5566,7 +5548,7 @@ again:
                         free_extent_buffer(log->node);
                         free_extent_buffer(log->commit_root);
                         kfree(log);
-                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
                                     "for tree log recovery.");
                         goto error;
                 }
@@ -5652,11 +5634,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
          * into the file.  When the file is logged we check it and
          * don't log the parents if the file is fully on disk.
          */
-       if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&BTRFS_I(inode)->log_mutex);
-               BTRFS_I(inode)->last_unlink_trans = trans->transid;
-               mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       }
+       mutex_lock(&BTRFS_I(inode)->log_mutex);
+       BTRFS_I(inode)->last_unlink_trans = trans->transid;
+       mutex_unlock(&BTRFS_I(inode)->log_mutex);
  
         /*
          * if this directory was already logged any new
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index bfb80da..2b88127 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
         [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
  };
  
+/*
+ * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
+ * condition is not met. Zero means there's no corresponding
+ * BTRFS_ERROR_DEV_*_NOT_MET value.
+ */
+const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+       [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+       [BTRFS_RAID_DUP]    = 0,
+       [BTRFS_RAID_RAID0]  = 0,
+       [BTRFS_RAID_SINGLE] = 0,
+       [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+       [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+};
+
  static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_device *device);
@@ -699,7 +714,8 @@ static noinline int device_list_add(const char *path,
          * if there is new btrfs on an already registered device,
          * then remove the stale device entry.
          */
-       btrfs_free_stale_device(device);
+       if (ret > 0)
+               btrfs_free_stale_device(device);
  
         *fs_devices_ret = fs_devices;
  
@@ -988,6 +1004,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
         return ret;
  }
  
+void btrfs_release_disk_super(struct page *page)
+{
+       kunmap(page);
+       put_page(page);
+}
+
+int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+               struct page **page, struct btrfs_super_block **disk_super)
+{
+       void *p;
+       pgoff_t index;
+
+       /* make sure our super fits in the device */
+       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+               return 1;
+
+       /* make sure our super fits in the page */
+       if (sizeof(**disk_super) > PAGE_SIZE)
+               return 1;
+
+       /* make sure our super doesn't straddle pages on disk */
+       index = bytenr >> PAGE_SHIFT;
+       if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
+               return 1;
+
+       /* pull in the page with our super */
+       *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+                                  index, GFP_KERNEL);
+
+       if (IS_ERR_OR_NULL(*page))
+               return 1;
+
+       p = kmap(*page);
+
+       /* align our pointer to the offset of the super block */
+       *disk_super = p + (bytenr & ~PAGE_MASK);
+
+       if (btrfs_super_bytenr(*disk_super) != bytenr ||
+           btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
+               btrfs_release_disk_super(*page);
+               return 1;
+       }
+
+       if ((*disk_super)->label[0] &&
+               (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
+               (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+
+       return 0;
+}
+
  /*
   * Look for a btrfs signature on a device. This may be called out of the mount path
   * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -999,13 +1065,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
         struct btrfs_super_block *disk_super;
         struct block_device *bdev;
         struct page *page;
-       void *p;
         int ret = -EINVAL;
         u64 devid;
         u64 transid;
         u64 total_devices;
         u64 bytenr;
-       pgoff_t index;
  
         /*
          * we would like to check all the supers, but that would make
@@ -1018,41 +1082,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
         mutex_lock(&uuid_mutex);
  
         bdev = blkdev_get_by_path(path, flags, holder);
-
         if (IS_ERR(bdev)) {
                 ret = PTR_ERR(bdev);
                 goto error;
         }
  
-       /* make sure our super fits in the device */
-       if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
-               goto error_bdev_put;
-
-       /* make sure our super fits in the page */
-       if (sizeof(*disk_super) > PAGE_SIZE)
-               goto error_bdev_put;
-
-       /* make sure our super doesn't straddle pages on disk */
-       index = bytenr >> PAGE_SHIFT;
-       if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
-               goto error_bdev_put;
-
-       /* pull in the page with our super */
-       page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
-                                  index, GFP_NOFS);
-
-       if (IS_ERR_OR_NULL(page))
+       if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
                 goto error_bdev_put;
  
-       p = kmap(page);
-
-       /* align our pointer to the offset of the super block */
-       disk_super = p + (bytenr & ~PAGE_MASK);
-
-       if (btrfs_super_bytenr(disk_super) != bytenr ||
-           btrfs_super_magic(disk_super) != BTRFS_MAGIC)
-               goto error_unmap;
-
         devid = btrfs_stack_device_id(&disk_super->dev_item);
         transid = btrfs_super_generation(disk_super);
         total_devices = btrfs_super_num_devices(disk_super);
@@ -1060,8 +1097,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
         if (ret > 0) {
                 if (disk_super->label[0]) {
-                       if (disk_super->label[BTRFS_LABEL_SIZE - 1])
-                               disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
                         printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
                 } else {
                         printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
@@ -1073,9 +1108,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
         if (!ret && fs_devices_ret)
                 (*fs_devices_ret)->total_devices = total_devices;
  
-error_unmap:
-       kunmap(page);
-       put_page(page);
+       btrfs_release_disk_super(page);
  
  error_bdev_put:
         blkdev_put(bdev, flags);
@@ -1454,7 +1487,7 @@ again:
                 extent = btrfs_item_ptr(leaf, path->slots[0],
                                         struct btrfs_dev_extent);
         } else {
-               btrfs_std_error(root->fs_info, ret, "Slot search failed");
+               btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
                 goto out;
         }
  
@@ -1462,7 +1495,7 @@ again:
  
         ret = btrfs_del_item(trans, root, path);
         if (ret) {
-               btrfs_std_error(root->fs_info, ret,
+               btrfs_handle_fs_error(root->fs_info, ret,
                             "Failed to remove dev extent item");
         } else {
                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
@@ -1688,32 +1721,92 @@ out:
         return ret;
  }
  
-int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+/*
+ * Verify that @num_devices satisfies the RAID profile constraints in the whole
+ * filesystem. It's up to the caller to adjust that number regarding eg. device
+ * replace.
+ */
+static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+               u64 num_devices)
+{
+       u64 all_avail;
+       unsigned seq;
+       int i;
+
+       do {
+               seq = read_seqbegin(&fs_info->profiles_lock);
+
+               all_avail = fs_info->avail_data_alloc_bits |
+                           fs_info->avail_system_alloc_bits |
+                           fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+               if (!(all_avail & btrfs_raid_group[i]))
+                       continue;
+
+               if (num_devices < btrfs_raid_array[i].devs_min) {
+                       int ret = btrfs_raid_mindev_error[i];
+
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
+
+struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
+                                       struct btrfs_device *device)
  {
-       struct btrfs_device *device;
         struct btrfs_device *next_device;
-       struct block_device *bdev;
-       struct buffer_head *bh = NULL;
-       struct btrfs_super_block *disk_super;
+
+       list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+               if (next_device != device &&
+                       !next_device->missing && next_device->bdev)
+                       return next_device;
+       }
+
+       return NULL;
+}
+
+/*
+ * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * and replace it with the provided or the next active device, in the context
+ * where this function is called, there should always be another device (or
+ * this_dev) which is active.
+ */
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+               struct btrfs_device *device, struct btrfs_device *this_dev)
+{
+       struct btrfs_device *next_device;
+
+       if (this_dev)
+               next_device = this_dev;
+       else
+               next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+                                                               device);
+       ASSERT(next_device);
+
+       if (fs_info->sb->s_bdev &&
+                       (fs_info->sb->s_bdev == device->bdev))
+               fs_info->sb->s_bdev = next_device->bdev;
+
+       if (fs_info->fs_devices->latest_bdev == device->bdev)
+               fs_info->fs_devices->latest_bdev = next_device->bdev;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+{
+       struct btrfs_device *device;
         struct btrfs_fs_devices *cur_devices;
-       u64 all_avail;
-       u64 devid;
         u64 num_devices;
-       u8 *dev_uuid;
-       unsigned seq;
         int ret = 0;
         bool clear_super = false;
+       char *dev_name = NULL;
  
         mutex_lock(&uuid_mutex);
  
-       do {
-               seq = read_seqbegin(&root->fs_info->profiles_lock);
-
-               all_avail = root->fs_info->avail_data_alloc_bits |
-                           root->fs_info->avail_system_alloc_bits |
-                           root->fs_info->avail_metadata_alloc_bits;
-       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
-
         num_devices = root->fs_info->fs_devices->num_devices;
         btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
@@ -1722,78 +1815,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         }
         btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
  
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-               ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
-               goto out;
-       }
-
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-               ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+       ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+       if (ret)
                 goto out;
-       }
  
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
-           root->fs_info->fs_devices->rw_devices <= 2) {
-               ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
-               goto out;
-       }
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
-           root->fs_info->fs_devices->rw_devices <= 3) {
-               ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+       ret = btrfs_find_device_by_devspec(root, devid, device_path,
+                               &device);
+       if (ret)
                 goto out;
-       }
-
-       if (strcmp(device_path, "missing") == 0) {
-               struct list_head *devices;
-               struct btrfs_device *tmp;
-
-               device = NULL;
-               devices = &root->fs_info->fs_devices->devices;
-               /*
-                * It is safe to read the devices since the volume_mutex
-                * is held.
-                */
-               list_for_each_entry(tmp, devices, dev_list) {
-                       if (tmp->in_fs_metadata &&
-                           !tmp->is_tgtdev_for_dev_replace &&
-                           !tmp->bdev) {
-                               device = tmp;
-                               break;
-                       }
-               }
-               bdev = NULL;
-               bh = NULL;
-               disk_super = NULL;
-               if (!device) {
-                       ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-                       goto out;
-               }
-       } else {
-               ret = btrfs_get_bdev_and_sb(device_path,
-                                           FMODE_WRITE | FMODE_EXCL,
-                                           root->fs_info->bdev_holder, 0,
-                                           &bdev, &bh);
-               if (ret)
-                       goto out;
-               disk_super = (struct btrfs_super_block *)bh->b_data;
-               devid = btrfs_stack_device_id(&disk_super->dev_item);
-               dev_uuid = disk_super->dev_item.uuid;
-               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
-                                          disk_super->fsid);
-               if (!device) {
-                       ret = -ENOENT;
-                       goto error_brelse;
-               }
-       }
  
         if (device->is_tgtdev_for_dev_replace) {
                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
-               goto error_brelse;
+               goto out;
         }
  
         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
-               goto error_brelse;
+               goto out;
         }
  
         if (device->writeable) {
@@ -1801,6 +1839,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 list_del_init(&device->dev_alloc_list);
                 device->fs_devices->rw_devices--;
                 unlock_chunks(root);
+               dev_name = kstrdup(device->name->str, GFP_KERNEL);
+               if (!dev_name) {
+                       ret = -ENOMEM;
+                       goto error_undo;
+               }
                 clear_super = true;
         }
  
@@ -1842,12 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         if (device->missing)
                 device->fs_devices->missing_devices--;
  
-       next_device = list_entry(root->fs_info->fs_devices->devices.next,
-                                struct btrfs_device, dev_list);
-       if (device->bdev == root->fs_info->sb->s_bdev)
-               root->fs_info->sb->s_bdev = next_device->bdev;
-       if (device->bdev == root->fs_info->fs_devices->latest_bdev)
-               root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+       btrfs_assign_next_active_device(root->fs_info, device, NULL);
  
         if (device->bdev) {
                 device->fs_devices->open_devices--;
@@ -1883,63 +1921,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
          * at this point, the device is zero sized.  We want to
          * remove it from the devices list and zero out the old super
          */
-       if (clear_super && disk_super) {
-               u64 bytenr;
-               int i;
-
-               /* make sure this device isn't detected as part of
-                * the FS anymore
-                */
-               memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-               set_buffer_dirty(bh);
-               sync_dirty_buffer(bh);
-
-               /* clear the mirror copies of super block on the disk
-                * being removed, 0th copy is been taken care above and
-                * the below would take of the rest
-                */
-               for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-                       bytenr = btrfs_sb_offset(i);
-                       if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-                                       i_size_read(bdev->bd_inode))
-                               break;
-
-                       brelse(bh);
-                       bh = __bread(bdev, bytenr / 4096,
-                                       BTRFS_SUPER_INFO_SIZE);
-                       if (!bh)
-                               continue;
-
-                       disk_super = (struct btrfs_super_block *)bh->b_data;
-
-                       if (btrfs_super_bytenr(disk_super) != bytenr ||
-                               btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-                               continue;
-                       }
-                       memset(&disk_super->magic, 0,
-                                               sizeof(disk_super->magic));
-                       set_buffer_dirty(bh);
-                       sync_dirty_buffer(bh);
+       if (clear_super) {
+               struct block_device *bdev;
+
+               bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
+                                               root->fs_info->bdev_holder);
+               if (!IS_ERR(bdev)) {
+                       btrfs_scratch_superblocks(bdev, dev_name);
+                       blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
                 }
         }
  
-       ret = 0;
-
-       if (bdev) {
-               /* Notify udev that device has changed */
-               btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
-               /* Update ctime/mtime for device path for libblkid */
-               update_dev_time(device_path);
-       }
-
-error_brelse:
-       brelse(bh);
-       if (bdev)
-               blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
  out:
+       kfree(dev_name);
+
         mutex_unlock(&uuid_mutex);
         return ret;
+
  error_undo:
         if (device->writeable) {
                 lock_chunks(root);
@@ -1948,7 +1946,7 @@ error_undo:
                 device->fs_devices->rw_devices++;
                 unlock_chunks(root);
         }
-       goto error_brelse;
+       goto out;
  }
  
  void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
@@ -1972,11 +1970,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
         if (srcdev->missing)
                 fs_devices->missing_devices--;
  
-       if (srcdev->writeable) {
+       if (srcdev->writeable)
                 fs_devices->rw_devices--;
-               /* zero out the old super if it is writable */
-               btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
-       }
  
         if (srcdev->bdev)
                 fs_devices->open_devices--;
@@ -1987,6 +1982,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
  {
         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
  
+       if (srcdev->writeable) {
+               /* zero out the old super if it is writable */
+               btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+       }
         call_rcu(&srcdev->rcu, free_device);
  
         /*
@@ -2016,32 +2015,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
  void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
                                       struct btrfs_device *tgtdev)
  {
-       struct btrfs_device *next_device;
-
         mutex_lock(&uuid_mutex);
         WARN_ON(!tgtdev);
         mutex_lock(&fs_info->fs_devices->device_list_mutex);
  
         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
  
-       if (tgtdev->bdev) {
-               btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+       if (tgtdev->bdev)
                 fs_info->fs_devices->open_devices--;
-       }
+
         fs_info->fs_devices->num_devices--;
  
-       next_device = list_entry(fs_info->fs_devices->devices.next,
-                                struct btrfs_device, dev_list);
-       if (tgtdev->bdev == fs_info->sb->s_bdev)
-               fs_info->sb->s_bdev = next_device->bdev;
-       if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
-               fs_info->fs_devices->latest_bdev = next_device->bdev;
-       list_del_rcu(&tgtdev->dev_list);
+       btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
  
-       call_rcu(&tgtdev->rcu, free_device);
+       list_del_rcu(&tgtdev->dev_list);
  
         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
         mutex_unlock(&uuid_mutex);
+
+       /*
+        * The update_dev_time() within btrfs_scratch_superblocks()
+        * may lead to a call to btrfs_show_devname() which will try
+        * to hold device_list_mutex. And here this device
+        * is already out of device list, so we don't have to hold
+        * the device_list_mutex lock.
+        */
+       btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+       call_rcu(&tgtdev->rcu, free_device);
  }
  
  static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -2101,6 +2101,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
         }
  }
  
+/*
+ * Look up a device by device id, or by the path if the id is 0.
+ */
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+                                        char *devpath,
+                                        struct btrfs_device **device)
+{
+       int ret;
+
+       if (devid) {
+               ret = 0;
+               *device = btrfs_find_device(root->fs_info, devid, NULL,
+                                           NULL);
+               if (!*device)
+                       ret = -ENOENT;  /* lookup by devid found nothing */
+       } else {
+               if (!devpath || !devpath[0])
+                       return -EINVAL; /* id is 0 and no path was given */
+
+               ret = btrfs_find_device_missing_or_by_path(root, devpath,
+                                                          device);
+       }
+       return ret;
+}
+
  /*
   * does all the dirty work required for changing file system's UUID.
   */
@@ -2418,7 +2443,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
  
                 ret = btrfs_relocate_sys_chunks(root);
                 if (ret < 0)
-                       btrfs_std_error(root->fs_info, ret,
+                       btrfs_handle_fs_error(root->fs_info, ret,
                                     "Failed to relocate sys chunks after "
                                     "device initialization. This can be fixed "
                                     "using the \"btrfs balance\" command.");
@@ -2663,7 +2688,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
         if (ret < 0)
                 goto out;
         else if (ret > 0) { /* Logic error or corruption */
-               btrfs_std_error(root->fs_info, -ENOENT,
+               btrfs_handle_fs_error(root->fs_info, -ENOENT,
                             "Failed lookup while freeing chunk.");
                 ret = -ENOENT;
                 goto out;
@@ -2671,7 +2696,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
  
         ret = btrfs_del_item(trans, root, path);
         if (ret < 0)
-               btrfs_std_error(root->fs_info, ret,
+               btrfs_handle_fs_error(root->fs_info, ret,
                             "Failed to delete chunk item.");
  out:
         btrfs_free_path(path);
@@ -2857,7 +2882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
                                                      chunk_offset);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
-               btrfs_std_error(root->fs_info, ret, NULL);
+               btrfs_handle_fs_error(root->fs_info, ret, NULL);
                 return ret;
         }
  
@@ -3402,6 +3427,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
         u32 count_meta = 0;
         u32 count_sys = 0;
         int chunk_reserved = 0;
+       u64 bytes_used = 0;
  
         /* step one make some room on all the devices */
         devices = &fs_info->fs_devices->devices;
@@ -3540,7 +3566,13 @@ again:
                         goto loop;
                 }
  
-               if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+               ASSERT(fs_info->data_sinfo);
+               spin_lock(&fs_info->data_sinfo->lock);
+               bytes_used = fs_info->data_sinfo->bytes_used;
+               spin_unlock(&fs_info->data_sinfo->lock);
+
+               if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+                   !chunk_reserved && !bytes_used) {
                         trans = btrfs_start_transaction(chunk_root, 0);
                         if (IS_ERR(trans)) {
                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3632,7 +3664,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
         unset_balance_control(fs_info);
         ret = del_balance_item(fs_info->tree_root);
         if (ret)
-               btrfs_std_error(fs_info, ret, NULL);
+               btrfs_handle_fs_error(fs_info, ret, NULL);
  
         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
  }
@@ -3693,10 +3725,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                 num_devices--;
         }
         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
-       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (num_devices == 1)
-               allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (num_devices > 1)
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+       if (num_devices > 1)
                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
         if (num_devices > 2)
                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
@@ -5278,7 +5308,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
         stripe_nr = div64_u64(stripe_nr, stripe_len);
  
         stripe_offset = stripe_nr * stripe_len;
-       BUG_ON(offset < stripe_offset);
+       if (offset < stripe_offset) {
+               btrfs_crit(fs_info, "stripe math has gone wrong, "
+                          "stripe_offset=%llu, offset=%llu, start=%llu, "
+                          "logical=%llu, stripe_len=%llu",
+                          stripe_offset, offset, em->start, logical,
+                          stripe_len);
+               free_extent_map(em);
+               return -EINVAL;
+       }
  
         /* stripe_offset is the offset of this block in its stripe*/
         stripe_offset = offset - stripe_offset;
@@ -5519,7 +5557,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                 &stripe_index);
                 mirror_num = stripe_index + 1;
         }
-       BUG_ON(stripe_index >= map->num_stripes);
+       if (stripe_index >= map->num_stripes) {
+               btrfs_crit(fs_info, "stripe index math went horribly wrong, "
+                          "got stripe_index=%u, num_stripes=%u",
+                          stripe_index, map->num_stripes);
+               ret = -EINVAL;
+               goto out;
+       }
  
         num_alloc_stripes = num_stripes;
         if (dev_replace_is_ongoing) {
@@ -6242,7 +6286,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                         "invalid chunk length %llu", length);
                 return -EIO;
         }
-       if (!is_power_of_2(stripe_len)) {
+       if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
                 btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
                           stripe_len);
                 return -EIO;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h

index 1939ebd..0ac90f8 100644 (file)
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -340,14 +340,14 @@ struct btrfs_raid_attr {
  };
  
  extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-
+extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
  extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
  
  struct map_lookup {
         u64 type;
         int io_align;
         int io_width;
-       int stripe_len;
+       u64 stripe_len;
         int sector_size;
         int num_stripes;
         int sub_stripes;
@@ -357,52 +357,6 @@ struct map_lookup {
  #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                             (sizeof(struct btrfs_bio_stripe) * (n)))
  
-/*
- * Restriper's general type filter
- */
-#define BTRFS_BALANCE_DATA             (1ULL << 0)
-#define BTRFS_BALANCE_SYSTEM           (1ULL << 1)
-#define BTRFS_BALANCE_METADATA         (1ULL << 2)
-
-#define BTRFS_BALANCE_TYPE_MASK                (BTRFS_BALANCE_DATA |       \
-                                        BTRFS_BALANCE_SYSTEM |     \
-                                        BTRFS_BALANCE_METADATA)
-
-#define BTRFS_BALANCE_FORCE            (1ULL << 3)
-#define BTRFS_BALANCE_RESUME           (1ULL << 4)
-
-/*
- * Balance filters
- */
-#define BTRFS_BALANCE_ARGS_PROFILES    (1ULL << 0)
-#define BTRFS_BALANCE_ARGS_USAGE       (1ULL << 1)
-#define BTRFS_BALANCE_ARGS_DEVID       (1ULL << 2)
-#define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
-#define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
-#define BTRFS_BALANCE_ARGS_LIMIT       (1ULL << 5)
-#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
-#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
-#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
-
-#define BTRFS_BALANCE_ARGS_MASK                        \
-       (BTRFS_BALANCE_ARGS_PROFILES |          \
-        BTRFS_BALANCE_ARGS_USAGE |             \
-        BTRFS_BALANCE_ARGS_DEVID |             \
-        BTRFS_BALANCE_ARGS_DRANGE |            \
-        BTRFS_BALANCE_ARGS_VRANGE |            \
-        BTRFS_BALANCE_ARGS_LIMIT |             \
-        BTRFS_BALANCE_ARGS_LIMIT_RANGE |       \
-        BTRFS_BALANCE_ARGS_STRIPES_RANGE |     \
-        BTRFS_BALANCE_ARGS_USAGE_RANGE)
-
-/*
- * Profile changing flags.  When SOFT is set we won't relocate chunk if
- * it already has the target profile (even though it may be
- * half-filled).
- */
-#define BTRFS_BALANCE_ARGS_CONVERT     (1ULL << 8)
-#define BTRFS_BALANCE_ARGS_SOFT                (1ULL << 9)
-
  struct btrfs_balance_args;
  struct btrfs_balance_progress;
  struct btrfs_balance_control {
@@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                           struct btrfs_fs_devices **fs_devices_ret);
  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
  void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+               struct btrfs_device *device, struct btrfs_device *this_dev);
  int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
                                          char *device_path,
                                          struct btrfs_device **device);
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+                                        char *devpath,
+                                        struct btrfs_device **device);
  struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                         const u64 *devid,
                                         const u8 *uuid);
-int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid);
  void btrfs_cleanup_fs_uuids(void);
  int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
  int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h

index dea8931..23c6960 100644 (file)
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -23,6 +23,7 @@
  
  #define BTRFS_IOCTL_MAGIC 0x94
  #define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_LABEL_SIZE 256
  
  /* this should be 4k */
  #define BTRFS_PATH_NAME_MAX 4087
@@ -33,14 +34,31 @@ struct btrfs_ioctl_vol_args {
  
  #define BTRFS_DEVICE_PATH_NAME_MAX 1024
  
-#define BTRFS_SUBVOL_CREATE_ASYNC      (1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY            (1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT    (1ULL << 2)
+#define BTRFS_DEVICE_SPEC_BY_ID                (1ULL << 3)
+
+#define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED               \
+                       (BTRFS_SUBVOL_CREATE_ASYNC |    \
+                       BTRFS_SUBVOL_RDONLY |           \
+                       BTRFS_SUBVOL_QGROUP_INHERIT |   \
+                       BTRFS_DEVICE_SPEC_BY_ID)
+
  #define BTRFS_FSID_SIZE 16
  #define BTRFS_UUID_SIZE 16
  #define BTRFS_UUID_UNPARSED_SIZE       37
  
-#define BTRFS_QGROUP_INHERIT_SET_LIMITS        (1ULL << 0)
+/*
+ * flags definition for qgroup limits
+ *
+ * Used by:
+ * struct btrfs_qgroup_limit.flags
+ * struct btrfs_qgroup_limit_item.flags
+ */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER    (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL    (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER    (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL    (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR   (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR   (1ULL << 5)
  
  struct btrfs_qgroup_limit {
         __u64   flags;
@@ -50,6 +68,14 @@ struct btrfs_qgroup_limit {
         __u64   rsv_excl;
  };
  
+/*
+ * flags definition for qgroup inheritance
+ *
+ * Used by:
+ * struct btrfs_qgroup_inherit.flags
+ */
+#define BTRFS_QGROUP_INHERIT_SET_LIMITS        (1ULL << 0)
+
  struct btrfs_qgroup_inherit {
         __u64   flags;
         __u64   num_qgroups;
@@ -64,6 +90,20 @@ struct btrfs_ioctl_qgroup_limit_args {
         struct btrfs_qgroup_limit lim;
  };
  
+/*
+ * flags for subvolumes
+ *
+ * Used by:
+ * struct btrfs_ioctl_vol_args_v2.flags
+ *
+ * BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls:
+ * - BTRFS_IOC_SUBVOL_GETFLAGS
+ * - BTRFS_IOC_SUBVOL_SETFLAGS
+ */
+#define BTRFS_SUBVOL_CREATE_ASYNC      (1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY            (1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT    (1ULL << 2)
+
  #define BTRFS_SUBVOL_NAME_MAX 4039
  struct btrfs_ioctl_vol_args_v2 {
         __s64 fd;
@@ -76,7 +116,10 @@ struct btrfs_ioctl_vol_args_v2 {
                 };
                 __u64 unused[4];
         };
-       char name[BTRFS_SUBVOL_NAME_MAX + 1];
+       union {
+               char name[BTRFS_SUBVOL_NAME_MAX + 1];
+               __u64 devid;    /* uapi type, like the other fields here */
+       };
  };
  
  /*
@@ -190,6 +233,37 @@ struct btrfs_ioctl_fs_info_args {
         __u64 reserved[122];                    /* pad to 1k */
  };
  
+/*
+ * feature flags
+ *
+ * Used by:
+ * struct btrfs_ioctl_feature_flags
+ */
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE        (1ULL << 0)
+
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS    (1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO    (1ULL << 3)
+/*
+ * Some patches floated around with a second compression method;
+ * let's save that incompat here for when they do get in.
+ * Note we don't actually support it, we're just reserving the
+ * number.
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2  (1ULL << 4)
+
+/*
+ * Older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy.  Let's not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA    (1ULL << 5)
+
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF   (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56          (1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES                (1ULL << 9)
+
  struct btrfs_ioctl_feature_flags {
         __u64 compat_flags;
         __u64 compat_ro_flags;
@@ -254,6 +328,70 @@ struct btrfs_balance_progress {
         __u64 completed;        /* # of chunks relocated so far */
  };
  
+/*
+ * flags definition for balance
+ *
+ * Restriper's general type filter
+ *
+ * Used by:
+ * btrfs_ioctl_balance_args.flags
+ * btrfs_balance_control.flags (internal)
+ */
+#define BTRFS_BALANCE_DATA             (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM           (1ULL << 1)
+#define BTRFS_BALANCE_METADATA         (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK                (BTRFS_BALANCE_DATA |       \
+                                        BTRFS_BALANCE_SYSTEM |     \
+                                        BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE            (1ULL << 3)
+#define BTRFS_BALANCE_RESUME           (1ULL << 4)
+
+/*
+ * flags definitions for per-type balance args
+ *
+ * Balance filters
+ *
+ * Used by:
+ * struct btrfs_balance_args
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES    (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE       (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID       (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT       (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
+
+#define BTRFS_BALANCE_ARGS_MASK                        \
+       (BTRFS_BALANCE_ARGS_PROFILES |          \
+        BTRFS_BALANCE_ARGS_USAGE |             \
+        BTRFS_BALANCE_ARGS_DEVID |             \
+        BTRFS_BALANCE_ARGS_DRANGE |            \
+        BTRFS_BALANCE_ARGS_VRANGE |            \
+        BTRFS_BALANCE_ARGS_LIMIT |             \
+        BTRFS_BALANCE_ARGS_LIMIT_RANGE |       \
+        BTRFS_BALANCE_ARGS_STRIPES_RANGE |     \
+        BTRFS_BALANCE_ARGS_USAGE_RANGE)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT     (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT                (1ULL << 9)
+
+
+/*
+ * flags definition for balance state
+ *
+ * Used by:
+ * struct btrfs_ioctl_balance_args.state
+ */
  #define BTRFS_BALANCE_STATE_RUNNING    (1ULL << 0)
  #define BTRFS_BALANCE_STATE_PAUSE_REQ  (1ULL << 1)
  #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
@@ -347,9 +485,45 @@ struct btrfs_ioctl_clone_range_args {
    __u64 dest_offset;
  };
  
-/* flags for the defrag range ioctl */
+/*
+ * flags definition for the defrag range ioctl
+ *
+ * Used by:
+ * struct btrfs_ioctl_defrag_range_args.flags
+ */
  #define BTRFS_DEFRAG_RANGE_COMPRESS 1
  #define BTRFS_DEFRAG_RANGE_START_IO 2
+struct btrfs_ioctl_defrag_range_args {
+       /* start of the defrag operation */
+       __u64 start;
+
+       /* number of bytes to defrag; use (u64)-1 to say all */
+       __u64 len;
+
+       /*
+        * flags for the operation (BTRFS_DEFRAG_RANGE_*), which can
+        * include turning on compression for this one defrag
+        */
+       __u64 flags;
+
+       /*
+        * any extent bigger than this will be considered
+        * already defragged.  Use 0 to take the kernel default.
+        * Use 1 to say every single extent must be rewritten.
+        */
+       __u32 extent_thresh;
+
+       /*
+        * which compression method to use if turning on compression
+        * for this defrag operation.  If unspecified, zlib will
+        * be used.
+        */
+       __u32 compress_type;
+
+       /* spare for later */
+       __u32 unused[4];
+};
+
  
  #define BTRFS_SAME_DATA_DIFFERS        1
  /* For extent-same ioctl */
@@ -659,5 +833,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code)
                                    struct btrfs_ioctl_feature_flags[2])
  #define BTRFS_IOC_GET_SUPPORTED_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \
                                    struct btrfs_ioctl_feature_flags[3])
+#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \
+                                  struct btrfs_ioctl_vol_args_v2)
  
  #endif /* _UAPI_LINUX_BTRFS_H */
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h

new file mode 100644 (file)

index 0000000..d5ad15a
--- /dev/null
+++ b/include/uapi/linux/btrfs_tree.h
@@ -0,0 +1,966 @@
+#ifndef _BTRFS_CTREE_H_
+#define _BTRFS_CTREE_H_
+
+/*
+ * This header contains the structure definitions and constants used
+ * by file system objects that can be retrieved using
+ * the BTRFS_IOC_SEARCH_TREE ioctl.  That means basically anything that
+ * is needed to describe a leaf node's key or item contents.
+ */
+
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
+/* device stats in the device tree */
+#define BTRFS_DEV_STATS_OBJECTID 0ULL
+
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
+/* orphan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
+
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+
+
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
+
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics.  There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY           1
+#define BTRFS_INODE_REF_KEY            12
+#define BTRFS_INODE_EXTREF_KEY         13
+#define BTRFS_XATTR_ITEM_KEY           24
+#define BTRFS_ORPHAN_ITEM_KEY          48
+/* reserve 2-15 close to the inode for later flexibility */
+
+/*
+ * dir items are the name -> inode pointers in a directory.  There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY  60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY     84
+#define BTRFS_DIR_INDEX_KEY    96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY  108
+
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY  128
+
+/*
+ * root items point to tree roots.  They are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY    132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY 144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root.  They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY     156
+
+/*
+ * extent items are in the extent map tree.  These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY  168
+
+/*
+ * The same as the BTRFS_EXTENT_ITEM_KEY, except that for metadata we already
+ * know the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY        169
+
+#define BTRFS_TREE_BLOCK_REF_KEY       176
+
+#define BTRFS_EXTENT_DATA_REF_KEY      178
+
+#define BTRFS_EXTENT_REF_V0_KEY                180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY     182
+
+#define BTRFS_SHARED_DATA_REF_KEY      184
+
+/*
+ * block groups give us hints into the extent allocation trees.  Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
+#define BTRFS_DEV_EXTENT_KEY   204
+#define BTRFS_DEV_ITEM_KEY     216
+#define BTRFS_CHUNK_ITEM_KEY   228
+
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY         240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY           242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY          244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY       246
+
+/*
+ * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
+ */
+#define BTRFS_BALANCE_ITEM_KEY 248
+
+/*
+ * The key type for tree items that are stored persistently, but do not need to
+ * exist for an extended period of time. The items can exist in any tree.
+ *
+ * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
+ *
+ * Existing items:
+ *
+ * - balance status item
+ *   (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
+ */
+#define BTRFS_TEMPORARY_ITEM_KEY       248
+
+/*
+ * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
+ */
+#define BTRFS_DEV_STATS_KEY            249
+
+/*
+ * The key type for tree items that are stored persistently and usually exist
+ * for a long period, eg. filesystem lifetime. The item kinds can be status
+ * information, stats or preference values. The item can exist in any tree.
+ *
+ * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
+ *
+ * Existing items:
+ *
+ * - device statistics, store IO stats in the device tree, one key for all
+ *   stats
+ *   (BTRFS_DEV_STATS_OBJECTID, BTRFS_PERSISTENT_ITEM_KEY, 0)
+ */
+#define BTRFS_PERSISTENT_ITEM_KEY      249
+
+/*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY  250
+
+/*
+ * Stores items that allow UUIDs to be quickly mapped to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL  251     /* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252     /* for UUIDs assigned to
+                                                * received subvols */
+
+/*
+ * string items are for debugging.  They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY  253
+
+
+
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32  0
+
+/*
+ * flags definitions for directory entry item type
+ *
+ * Used by:
+ * struct btrfs_dir_item.type
+ */
+#define BTRFS_FT_UNKNOWN       0
+#define BTRFS_FT_REG_FILE      1
+#define BTRFS_FT_DIR           2
+#define BTRFS_FT_CHRDEV                3
+#define BTRFS_FT_BLKDEV                4
+#define BTRFS_FT_FIFO          5
+#define BTRFS_FT_SOCK          6
+#define BTRFS_FT_SYMLINK       7
+#define BTRFS_FT_XATTR         8
+#define BTRFS_FT_MAX           9
+
+/*
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
+ * in cpu native order.  Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+struct btrfs_disk_key {
+       __le64 objectid;
+       __u8 type;
+       __le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
+       __u64 objectid;
+       __u8 type;
+       __u64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_dev_item {
+       /* the internal btrfs device id */
+       __le64 devid;
+
+       /* size of the device */
+       __le64 total_bytes;
+
+       /* bytes used */
+       __le64 bytes_used;
+
+       /* optimal io alignment for this device */
+       __le32 io_align;
+
+       /* optimal io width for this device */
+       __le32 io_width;
+
+       /* minimal io size for this device */
+       __le32 sector_size;
+
+       /* type and info about this device */
+       __le64 type;
+
+       /* expected generation for this device */
+       __le64 generation;
+
+       /*
+        * starting byte of this partition on the device,
+        * to allow for stripe alignment in the future
+        */
+       __le64 start_offset;
+
+       /* grouping information for allocation decisions */
+       __le32 dev_group;
+
+       /* seek speed 0-100 where 100 is fastest */
+       __u8 seek_speed;
+
+       /* bandwidth 0-100 where 100 is fastest */
+       __u8 bandwidth;
+
+       /* btrfs generated uuid for this device */
+       __u8 uuid[BTRFS_UUID_SIZE];
+
+       /* uuid of FS who owns this device */
+       __u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+       __le64 devid;
+       __le64 offset;
+       __u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+       /* size of this chunk in bytes */
+       __le64 length;
+
+       /* objectid of the root referencing this chunk */
+       __le64 owner;
+
+       __le64 stripe_len;
+       __le64 type;
+
+       /* optimal io alignment for this chunk */
+       __le32 io_align;
+
+       /* optimal io width for this chunk */
+       __le32 io_width;
+
+       /* minimal io size for this chunk */
+       __le32 sector_size;
+
+       /* 2^16 stripes is quite a lot, a second limit is the size of a single
+        * item in the btree
+        */
+       __le16 num_stripes;
+
+       /* sub stripes only matter for raid10 */
+       __le16 sub_stripes;
+       struct btrfs_stripe stripe;
+       /* additional stripes go here */
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_EXTENT        1
+#define BTRFS_FREE_SPACE_BITMAP        2
+
+struct btrfs_free_space_entry {
+       __le64 offset;
+       __le64 bytes;
+       __u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+       struct btrfs_disk_key location;
+       __le64 generation;
+       __le64 num_entries;
+       __le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
+#define BTRFS_HEADER_FLAG_WRITTEN      (1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC                (1ULL << 1)
+
+/* Super block flags */
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR         (1ULL << 2)
+
+#define BTRFS_SUPER_FLAG_SEEDING       (1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP      (1ULL << 33)
+
+
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+
+struct btrfs_extent_item {
+       __le64 refs;
+       __le64 generation;
+       __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
+       __le32 refs;
+} __attribute__ ((__packed__));
+
+
+#define BTRFS_EXTENT_FLAG_DATA         (1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK   (1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF  (1ULL << 8)
+
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER                (1ULL << 48)
+
+struct btrfs_tree_block_info {
+       struct btrfs_disk_key key;
+       __u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+       __le64 root;
+       __le64 objectid;
+       __le64 offset;
+       __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+       __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+       __u8 type;
+       __le64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
+       __le64 root;
+       __le64 generation;
+       __le64 objectid;
+       __le32 count;
+} __attribute__ ((__packed__));
+
+
+/* dev extents record free space on individual devices.  The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent.  The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+       __le64 chunk_tree;
+       __le64 chunk_objectid;
+       __le64 chunk_offset;
+       __le64 length;
+       __u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_ref {
+       __le64 index;
+       __le16 name_len;
+       /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_extref {
+       __le64 parent_objectid;
+       __le64 index;
+       __le16 name_len;
+       __u8   name[0];
+       /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_timespec {
+       __le64 sec;
+       __le32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_item {
+       /* nfs style generation number */
+       __le64 generation;
+       /* transid that last touched this inode */
+       __le64 transid;
+       __le64 size;
+       __le64 nbytes;
+       __le64 block_group;
+       __le32 nlink;
+       __le32 uid;
+       __le32 gid;
+       __le32 mode;
+       __le64 rdev;
+       __le64 flags;
+
+       /* modification sequence number for NFS */
+       __le64 sequence;
+
+       /*
+        * a little future expansion, for more than this we can
+        * just grow the inode item and version it
+        */
+       __le64 reserved[4];
+       struct btrfs_timespec atime;
+       struct btrfs_timespec ctime;
+       struct btrfs_timespec mtime;
+       struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_log_item {
+       __le64 end;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+       struct btrfs_disk_key location;
+       __le64 transid;
+       __le16 data_len;
+       __le16 name_len;
+       __u8 type;
+} __attribute__ ((__packed__));
+
+#define BTRFS_ROOT_SUBVOL_RDONLY       (1ULL << 0)
+
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD         (1ULL << 48)
+
+struct btrfs_root_item {
+       struct btrfs_inode_item inode;
+       __le64 generation;
+       __le64 root_dirid;
+       __le64 bytenr;
+       __le64 byte_limit;
+       __le64 bytes_used;
+       __le64 last_snapshot;
+       __le64 flags;
+       __le32 refs;
+       struct btrfs_disk_key drop_progress;
+       __u8 drop_level;
+       __u8 level;
+
+       /*
+        * The following fields appear after subvol_uuids+subvol_times
+        * were introduced.
+        */
+
+       /*
+        * This generation number is used to test if the new fields are valid
+        * and up to date while reading the root item. Every time the root item
+        * is written out, the "generation" field is copied into this field. If
+        * anyone ever mounted the fs with an older kernel, we will have
+        * mismatching generation values here and thus must invalidate the
+        * new fields. See btrfs_update_root and btrfs_find_last_root for
+        * details.
+        * the offset of generation_v2 is also used as the start for the memset
+        * when invalidating the fields.
+        */
+       __le64 generation_v2;
+       __u8 uuid[BTRFS_UUID_SIZE];
+       __u8 parent_uuid[BTRFS_UUID_SIZE];
+       __u8 received_uuid[BTRFS_UUID_SIZE];
+       __le64 ctransid; /* updated when an inode changes */
+       __le64 otransid; /* trans when created */
+       __le64 stransid; /* trans when sent. non-zero for received subvol */
+       __le64 rtransid; /* trans when received. non-zero for received subvol */
+       struct btrfs_timespec ctime;
+       struct btrfs_timespec otime;
+       struct btrfs_timespec stime;
+       struct btrfs_timespec rtime;
+       __le64 reserved[8]; /* for future */
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+       __le64 dirid;
+       __le64 sequence;
+       __le16 name_len;
+} __attribute__ ((__packed__));
+
+struct btrfs_disk_balance_args {
+       /*
+        * profiles to operate on, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 profiles;
+
+       /*
+        * usage filter
+        * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+        * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+        */
+       union {
+               __le64 usage;
+               struct {
+                       __le32 usage_min;
+                       __le32 usage_max;
+               };
+       };
+
+       /* devid filter */
+       __le64 devid;
+
+       /* devid subset filter [pstart..pend) */
+       __le64 pstart;
+       __le64 pend;
+
+       /* btrfs virtual address space subset filter [vstart..vend) */
+       __le64 vstart;
+       __le64 vend;
+
+       /*
+        * profile to convert to, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 target;
+
+       /* BTRFS_BALANCE_ARGS_* */
+       __le64 flags;
+
+       /*
+        * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+        * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
+        * and maximum
+        */
+       union {
+               __le64 limit;
+               struct {
+                       __le32 limit_min;
+                       __le32 limit_max;
+               };
+       };
+
+       /*
+        * Process chunks that cross stripes_min..stripes_max devices,
+        * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+        */
+       __le32 stripes_min;
+       __le32 stripes_max;
+
+       __le64 unused[6];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+       /* BTRFS_BALANCE_* */
+       __le64 flags;
+
+       struct btrfs_disk_balance_args data;
+       struct btrfs_disk_balance_args meta;
+       struct btrfs_disk_balance_args sys;
+
+       __le64 unused[4];
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+       /*
+        * transaction id that created this extent
+        */
+       __le64 generation;
+       /*
+        * max number of bytes to hold this extent in ram
+        * when we split a compressed extent we can't know how big
+        * each of the resulting pieces will be.  So, this is
+        * an upper limit on the size of the extent in ram instead of
+        * an exact limit.
+        */
+       __le64 ram_bytes;
+
+       /*
+        * 32 bits for the various ways we might encode the data,
+        * including compression and encryption.  If any of these
+        * are set to something a given disk format doesn't understand
+        * it is treated like an incompat flag for reading and writing,
+        * but not for stat.
+        */
+       __u8 compression;
+       __u8 encryption;
+       __le16 other_encoding; /* spare for later use */
+
+       /* are we inline data or a real extent? */
+       __u8 type;
+
+       /*
+        * disk space consumed by the extent, checksum blocks are included
+        * in these numbers
+        *
+        * At this offset in the structure, the inline extent data start.
+        */
+       __le64 disk_bytenr;
+       __le64 disk_num_bytes;
+       /*
+        * the logical offset in file blocks (no csums)
+        * this extent record is for.  This allows a file extent to point
+        * into the middle of an existing extent on disk, sharing it
+        * between two snapshots (useful if some bytes in the middle of the
+        * extent have changed)
+        */
+       __le64 offset;
+       /*
+        * the logical number of file blocks (no csums included).  This
+        * always reflects the size uncompressed and without encoding.
+        */
+       __le64 num_bytes;
+
+} __attribute__ ((__packed__));
+
+struct btrfs_csum_item {
+       __u8 csum;
+} __attribute__ ((__packed__));
+
+struct btrfs_dev_stats_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS    0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID     1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED     0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED           1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED         2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED          3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED          4
+
+struct btrfs_dev_replace_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 src_devid;
+       __le64 cursor_left;
+       __le64 cursor_right;
+       __le64 cont_reading_from_srcdev_mode;
+
+       __le64 replace_state;
+       __le64 time_started;
+       __le64 time_stopped;
+       __le64 num_write_errors;
+       __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
+#define BTRFS_BLOCK_GROUP_RESERVED     (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+                                        BTRFS_SPACE_INFO_GLOBAL_RSV)
+
+enum btrfs_raid_types {
+       BTRFS_RAID_RAID10,
+       BTRFS_RAID_RAID1,
+       BTRFS_RAID_DUP,
+       BTRFS_RAID_RAID0,
+       BTRFS_RAID_SINGLE,
+       BTRFS_RAID_RAID5,
+       BTRFS_RAID_RAID6,
+       BTRFS_NR_RAID_TYPES
+};
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
+                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
+                                        BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
+                                        BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_RAID5 |   \
+                                        BTRFS_BLOCK_GROUP_RAID6 |   \
+                                        BTRFS_BLOCK_GROUP_DUP |     \
+                                        BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK  (BTRFS_BLOCK_GROUP_RAID5 |   \
+                                        BTRFS_BLOCK_GROUP_RAID6)
+
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
+
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV    (1ULL << 49)
+
+#define BTRFS_EXTENDED_PROFILE_MASK    (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+                                        BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline __u64 chunk_to_extended(__u64 flags)
+{
+       if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+               flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       return flags;
+}
+static inline __u64 extended_to_chunk(__u64 flags)
+{
+       return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
+
+struct btrfs_block_group_item {
+       __le64 used;
+       __le64 chunk_objectid;
+       __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_info {
+       __le32 extent_count;
+       __le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
+#define BTRFS_QGROUP_LEVEL_SHIFT               48
+static inline __u64 btrfs_qgroup_level(__u64 qgroupid)
+{
+       return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON            (1ULL << 0)
+/*
+ * RESCAN is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN                (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning quota off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT  (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION        1
+
+struct btrfs_qgroup_status_item {
+       __le64 version;
+       /*
+        * the generation is updated during every commit. As older
+        * versions of btrfs are not aware of qgroups, it will be
+        * possible to detect inconsistencies by checking the
+        * generation at mount time
+        */
+       __le64 generation;
+
+       /* flag definitions see above */
+       __le64 flags;
+
+       /*
+        * only used during scanning to record the progress
+        * of the scan. It contains a logical address
+        */
+       __le64 rescan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+       __le64 generation;
+       __le64 rfer;
+       __le64 rfer_cmpr;
+       __le64 excl;
+       __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_limit_item {
+       /*
+        * only updated when any of the other values change
+        */
+       __le64 flags;
+       __le64 max_rfer;
+       __le64 max_excl;
+       __le64 rsv_rfer;
+       __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
+#endif /* _BTRFS_CTREE_H_ */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 17:49:22 +0000 (10:49 -0700)
fs/btrfs/backref.c		patch \| blob \| history
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/compression.c		patch \| blob \| history
fs/btrfs/ctree.c		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/delayed-inode.c		patch \| blob \| history
fs/btrfs/dev-replace.c		patch \| blob \| history
fs/btrfs/dev-replace.h		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/extent_io.c		patch \| blob \| history
fs/btrfs/extent_io.h		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/inode-item.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ioctl.c		patch \| blob \| history
fs/btrfs/ordered-data.c		patch \| blob \| history
fs/btrfs/ordered-data.h		patch \| blob \| history
fs/btrfs/relocation.c		patch \| blob \| history
fs/btrfs/root-tree.c		patch \| blob \| history
fs/btrfs/scrub.c		patch \| blob \| history
fs/btrfs/send.c		patch \| blob \| history
fs/btrfs/super.c		patch \| blob \| history
fs/btrfs/sysfs.c		patch \| blob \| history
fs/btrfs/transaction.c		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history
fs/btrfs/volumes.c		patch \| blob \| history
fs/btrfs/volumes.h		patch \| blob \| history
include/uapi/linux/btrfs.h		patch \| blob \| history
include/uapi/linux/btrfs_tree.h	[new file with mode: 0644]	patch \| blob