Merge branch 'qgroup' of git://git.jan-o-sch.net/btrfs-unstable into for-linus
authorChris Mason <chris.mason@fusionio.com>
Wed, 25 Jul 2012 20:11:38 +0000 (16:11 -0400)
committerChris Mason <chris.mason@fusionio.com>
Wed, 25 Jul 2012 20:11:38 +0000 (16:11 -0400)
Conflicts:
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
1  2 
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h

diff --combined fs/btrfs/ctree.h
@@@ -91,6 -91,9 +91,9 @@@ struct btrfs_ordered_sum
  /* for storing balance parameters in the root tree */
  #define BTRFS_BALANCE_OBJECTID -4ULL
  
+ /* holds quota configuration and tracking */
+ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
  /* orphan objectid for tracking unlinked/truncated files */
  #define BTRFS_ORPHAN_OBJECTID -5ULL
  
@@@ -883,6 -886,72 +886,72 @@@ struct btrfs_block_group_item 
        __le64 flags;
  } __attribute__ ((__packed__));
  
+ /*
+  * is subvolume quota turned on?
+  */
+ #define BTRFS_QGROUP_STATUS_FLAG_ON           (1ULL << 0)
+ /*
+  * SCANNING is set during the initialization phase
+  */
+ #define BTRFS_QGROUP_STATUS_FLAG_SCANNING     (1ULL << 1)
+ /*
+  * Some qgroup entries are known to be out of date,
+  * either because the configuration has changed in a way that
+  * makes a rescan necessary, or because the fs has been mounted
+  * with a non-qgroup-aware version.
+  * Turning quota off and on again makes it inconsistent, too.
+  */
+ #define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
+ #define BTRFS_QGROUP_STATUS_VERSION        1
+ struct btrfs_qgroup_status_item {
+       __le64 version;
+       /*
+        * the generation is updated during every commit. As older
+        * versions of btrfs are not aware of qgroups, it will be
+        * possible to detect inconsistencies by checking the
+        * generation on mount time
+        */
+       __le64 generation;
+       /* flag definitions see above */
+       __le64 flags;
+       /*
+        * only used during scanning to record the progress
+        * of the scan. It contains a logical address
+        */
+       __le64 scan;
+ } __attribute__ ((__packed__));
+ struct btrfs_qgroup_info_item {
+       __le64 generation;
+       __le64 rfer;
+       __le64 rfer_cmpr;
+       __le64 excl;
+       __le64 excl_cmpr;
+ } __attribute__ ((__packed__));
+ /* flags definition for qgroup limits */
+ #define BTRFS_QGROUP_LIMIT_MAX_RFER   (1ULL << 0)
+ #define BTRFS_QGROUP_LIMIT_MAX_EXCL   (1ULL << 1)
+ #define BTRFS_QGROUP_LIMIT_RSV_RFER   (1ULL << 2)
+ #define BTRFS_QGROUP_LIMIT_RSV_EXCL   (1ULL << 3)
+ #define BTRFS_QGROUP_LIMIT_RFER_CMPR  (1ULL << 4)
+ #define BTRFS_QGROUP_LIMIT_EXCL_CMPR  (1ULL << 5)
+ struct btrfs_qgroup_limit_item {
+       /*
+        * only updated when any of the other values change
+        */
+       __le64 flags;
+       __le64 max_rfer;
+       __le64 max_excl;
+       __le64 rsv_rfer;
+       __le64 rsv_excl;
+ } __attribute__ ((__packed__));
  struct btrfs_space_info {
        u64 flags;
  
@@@ -1030,6 -1099,13 +1099,13 @@@ struct btrfs_block_group_cache 
        struct list_head cluster_list;
  };
  
+ /* delayed seq elem */
+ struct seq_list {
+       struct list_head list;
+       u64 seq;
+ };
+ /* fs_info */
  struct reloc_control;
  struct btrfs_device;
  struct btrfs_fs_devices;
@@@ -1044,6 -1120,7 +1120,7 @@@ struct btrfs_fs_info 
        struct btrfs_root *dev_root;
        struct btrfs_root *fs_root;
        struct btrfs_root *csum_root;
+       struct btrfs_root *quota_root;
  
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
        spinlock_t tree_mod_seq_lock;
        atomic_t tree_mod_seq;
        struct list_head tree_mod_seq_list;
+       struct seq_list tree_mod_seq_elem;
+       wait_queue_head_t tree_mod_seq_wait;
  
        /* this protects tree_mod_log */
        rwlock_t tree_mod_log_lock;
         */
        struct list_head space_info;
  
 +      struct btrfs_space_info *data_sinfo;
 +
        struct reloc_control *reloc_ctl;
  
        spinlock_t delalloc_lock;
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        u32 check_integrity_print_mask;
  #endif
+       /*
+        * quota information
+        */
+       unsigned int quota_enabled:1;
+       /*
+        * quota_enabled only changes state after a commit. This holds the
+        * next state.
+        */
+       unsigned int pending_quota_state:1;
+       /* is qgroup tracking in a consistent state? */
+       u64 qgroup_flags;
+       /* holds configuration and tracking. Protected by qgroup_lock */
+       struct rb_root qgroup_tree;
+       spinlock_t qgroup_lock;
+       /* list of dirty qgroups to be written at next commit */
+       struct list_head dirty_qgroups;
+       /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+       u64 qgroup_seq;
  
        /* filesystem state */
        u64 fs_state;
@@@ -1527,6 -1627,30 +1629,30 @@@ struct btrfs_ioctl_defrag_range_args 
  #define BTRFS_DEV_ITEM_KEY    216
  #define BTRFS_CHUNK_ITEM_KEY  228
  
+ /*
+  * Records the overall state of the qgroups.
+  * There's only one instance of this key present,
+  * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+  */
+ #define BTRFS_QGROUP_STATUS_KEY         240
+ /*
+  * Records the currently used space of the qgroup.
+  * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+  */
+ #define BTRFS_QGROUP_INFO_KEY           242
+ /*
+  * Contains the user configured limits for the qgroup.
+  * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+  */
+ #define BTRFS_QGROUP_LIMIT_KEY          244
+ /*
+  * Records the child-parent relationship of qgroups. For
+  * each relation, 2 keys are present:
+  * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+  * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+  */
+ #define BTRFS_QGROUP_RELATION_KEY       246
  #define BTRFS_BALANCE_ITEM_KEY        248
  
  /*
@@@ -1623,54 -1747,13 +1749,54 @@@ static inline void btrfs_init_map_toke
                            offsetof(type, member),                     \
                           sizeof(((type *)0)->member)))
  
 -#ifndef BTRFS_SETGET_FUNCS
 +#define DECLARE_BTRFS_SETGET_BITS(bits)                                       \
 +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,   \
 +                             unsigned long off,                       \
 +                              struct btrfs_map_token *token);         \
 +void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr,      \
 +                          unsigned long off, u##bits val,             \
 +                          struct btrfs_map_token *token);             \
 +static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
 +                                     unsigned long off)               \
 +{                                                                     \
 +      return btrfs_get_token_##bits(eb, ptr, off, NULL);              \
 +}                                                                     \
 +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
 +                                  unsigned long off, u##bits val)     \
 +{                                                                     \
 +       btrfs_set_token_##bits(eb, ptr, off, val, NULL);                       \
 +}
 +
 +DECLARE_BTRFS_SETGET_BITS(8)
 +DECLARE_BTRFS_SETGET_BITS(16)
 +DECLARE_BTRFS_SETGET_BITS(32)
 +DECLARE_BTRFS_SETGET_BITS(64)
 +
  #define BTRFS_SETGET_FUNCS(name, type, member, bits)                  \
 -u##bits btrfs_##name(struct extent_buffer *eb, type *s);              \
 -u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token);         \
 -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
 -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 -#endif
 +static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
 +{                                                                     \
 +      BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);   \
 +      return btrfs_get_##bits(eb, s, offsetof(type, member));         \
 +}                                                                     \
 +static inline void btrfs_set_##name(struct extent_buffer *eb, type *s,        \
 +                                  u##bits val)                        \
 +{                                                                     \
 +      BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);   \
 +      btrfs_set_##bits(eb, s, offsetof(type, member), val);           \
 +}                                                                     \
 +static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
 +                                       struct btrfs_map_token *token) \
 +{                                                                     \
 +      BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);   \
 +      return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
 +}                                                                     \
 +static inline void btrfs_set_token_##name(struct extent_buffer *eb,   \
 +                                        type *s, u##bits val,         \
 +                                         struct btrfs_map_token *token)       \
 +{                                                                     \
 +      BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);   \
 +      btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
 +}
  
  #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)           \
  static inline u##bits btrfs_##name(struct extent_buffer *eb)          \
@@@ -2508,6 -2591,49 +2634,49 @@@ static inline void btrfs_set_dev_stats_
                            sizeof(val));
  }
  
+ /* btrfs_qgroup_status_item */
+ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+                  generation, 64);
+ BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+                  version, 64);
+ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+                  flags, 64);
+ BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
+                  scan, 64);
+ /* btrfs_qgroup_info_item */
+ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+                  generation, 64);
+ BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+ BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+                  rfer_cmpr, 64);
+ BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+ BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+                  excl_cmpr, 64);
+ BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+                        struct btrfs_qgroup_info_item, generation, 64);
+ BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+                        rfer, 64);
+ BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+                        struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+ BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+                        excl, 64);
+ BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+                        struct btrfs_qgroup_info_item, excl_cmpr, 64);
+ /* btrfs_qgroup_limit_item */
+ BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+                  flags, 64);
+ BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+                  max_rfer, 64);
+ BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+                  max_excl, 64);
+ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+                  rsv_rfer, 64);
+ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+                  rsv_excl, 64);
  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
  {
        return sb->s_fs_info;
@@@ -2650,6 -2776,7 +2819,6 @@@ int btrfs_remove_block_group(struct btr
                             struct btrfs_root *root, u64 group_start);
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
  void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
  int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
  void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
@@@ -2703,6 -2830,8 +2872,8 @@@ int btrfs_force_chunk_alloc(struct btrf
  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
  
  int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -2753,6 -2882,9 +2924,9 @@@ int btrfs_search_slot(struct btrfs_tran
                      ins_len, int cow);
  int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq);
+ int btrfs_search_slot_for_read(struct btrfs_root *root,
+                              struct btrfs_key *key, struct btrfs_path *p,
+                              int find_higher, int return_any);
  int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, int cache_only, u64 *last_ret,
@@@ -2835,11 -2967,22 +3009,22 @@@ static inline void free_fs_info(struct 
        kfree(fs_info->chunk_root);
        kfree(fs_info->dev_root);
        kfree(fs_info->csum_root);
+       kfree(fs_info->quota_root);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
  }
  
+ /* tree mod log functions from ctree.c */
+ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                          struct seq_list *elem);
+ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+ {
+       return atomic_inc_return(&fs_info->tree_mod_seq);
+ }
  /* root-item.c */
  int btrfs_find_root_ref(struct btrfs_root *tree_root,
                        struct btrfs_path *path,
@@@ -3023,6 -3166,7 +3208,6 @@@ int btrfs_readpage(struct file *file, s
  void btrfs_evict_inode(struct inode *inode);
  int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
  int btrfs_dirty_inode(struct inode *inode);
 -int btrfs_update_time(struct file *file);
  struct inode *btrfs_alloc_inode(struct super_block *sb);
  void btrfs_destroy_inode(struct inode *inode);
  int btrfs_drop_inode(struct inode *inode);
@@@ -3198,17 -3342,49 +3383,49 @@@ void btrfs_reada_detach(void *handle)
  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                         u64 start, int err);
  
- /* delayed seq elem */
- struct seq_list {
+ /* qgroup.c */
+ struct qgroup_update {
        struct list_head list;
-       u64 seq;
-       u32 flags;
+       struct btrfs_delayed_ref_node *node;
+       struct btrfs_delayed_extent_op *extent_op;
  };
  
- void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-                           struct seq_list *elem);
- void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
-                           struct seq_list *elem);
+ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info);
+ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+ int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid,
+                       char *name);
+ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 qgroupid);
+ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit);
+ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+ struct btrfs_delayed_extent_op;
+ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_node *node,
+                           struct btrfs_delayed_extent_op *extent_op);
+ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_node *node,
+                            struct btrfs_delayed_extent_op *extent_op);
+ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info);
+ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit);
+ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
  
  static inline int is_fstree(u64 rootid)
  {
diff --combined fs/btrfs/disk-io.c
@@@ -407,7 -407,7 +407,7 @@@ static int btree_read_extent_buffer_pag
                        break;
        }
  
 -      if (failed && !ret)
 +      if (failed && !ret && failed_mirror)
                repair_eb_io_failure(root, eb, failed_mirror);
  
        return ret;
@@@ -1225,6 -1225,82 +1225,82 @@@ static struct btrfs_root *btrfs_alloc_r
        return root;
  }
  
+ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_info *fs_info,
+                                    u64 objectid)
+ {
+       struct extent_buffer *leaf;
+       struct btrfs_root *tree_root = fs_info->tree_root;
+       struct btrfs_root *root;
+       struct btrfs_key key;
+       int ret = 0;
+       u64 bytenr;
+       root = btrfs_alloc_root(fs_info);
+       if (!root)
+               return ERR_PTR(-ENOMEM);
+       __setup_root(tree_root->nodesize, tree_root->leafsize,
+                    tree_root->sectorsize, tree_root->stripesize,
+                    root, fs_info, objectid);
+       root->root_key.objectid = objectid;
+       root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root->root_key.offset = 0;
+       leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+                                     0, objectid, NULL, 0, 0, 0);
+       if (IS_ERR(leaf)) {
+               ret = PTR_ERR(leaf);
+               goto fail;
+       }
+       bytenr = leaf->start;
+       memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+       btrfs_set_header_bytenr(leaf, leaf->start);
+       btrfs_set_header_generation(leaf, trans->transid);
+       btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+       btrfs_set_header_owner(leaf, objectid);
+       root->node = leaf;
+       write_extent_buffer(leaf, fs_info->fsid,
+                           (unsigned long)btrfs_header_fsid(leaf),
+                           BTRFS_FSID_SIZE);
+       write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+                           (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+                           BTRFS_UUID_SIZE);
+       btrfs_mark_buffer_dirty(leaf);
+       root->commit_root = btrfs_root_node(root);
+       root->track_dirty = 1;
+       root->root_item.flags = 0;
+       root->root_item.byte_limit = 0;
+       btrfs_set_root_bytenr(&root->root_item, leaf->start);
+       btrfs_set_root_generation(&root->root_item, trans->transid);
+       btrfs_set_root_level(&root->root_item, 0);
+       btrfs_set_root_refs(&root->root_item, 1);
+       btrfs_set_root_used(&root->root_item, leaf->len);
+       btrfs_set_root_last_snapshot(&root->root_item, 0);
+       btrfs_set_root_dirid(&root->root_item, 0);
+       root->root_item.drop_level = 0;
+       key.objectid = objectid;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = 0;
+       ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+       if (ret)
+               goto fail;
+       btrfs_tree_unlock(leaf);
+ fail:
+       if (ret)
+               return ERR_PTR(ret);
+       return root;
+ }
  static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
  {
@@@ -1396,6 -1472,9 +1472,9 @@@ struct btrfs_root *btrfs_read_fs_root_n
                return fs_info->dev_root;
        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
                return fs_info->csum_root;
+       if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+               return fs_info->quota_root ? fs_info->quota_root :
+                                            ERR_PTR(-ENOENT);
  again:
        spin_lock(&fs_info->fs_roots_radix_lock);
        root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@@ -1823,6 -1902,10 +1902,10 @@@ static void free_root_pointers(struct b
        free_extent_buffer(info->extent_root->commit_root);
        free_extent_buffer(info->csum_root->node);
        free_extent_buffer(info->csum_root->commit_root);
+       if (info->quota_root) {
+               free_extent_buffer(info->quota_root->node);
+               free_extent_buffer(info->quota_root->commit_root);
+       }
  
        info->tree_root->node = NULL;
        info->tree_root->commit_root = NULL;
        info->extent_root->commit_root = NULL;
        info->csum_root->node = NULL;
        info->csum_root->commit_root = NULL;
+       if (info->quota_root) {
+               info->quota_root->node = NULL;
+               info->quota_root->commit_root = NULL;
+       }
  
        if (chunk_root) {
                free_extent_buffer(info->chunk_root->node);
@@@ -1862,6 -1949,7 +1949,7 @@@ int open_ctree(struct super_block *sb
        struct btrfs_root *csum_root;
        struct btrfs_root *chunk_root;
        struct btrfs_root *dev_root;
+       struct btrfs_root *quota_root;
        struct btrfs_root *log_tree_root;
        int ret;
        int err = -EINVAL;
        csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
        dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
+       quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
  
        if (!tree_root || !extent_root || !csum_root ||
-           !chunk_root || !dev_root) {
+           !chunk_root || !dev_root || !quota_root) {
                err = -ENOMEM;
                goto fail;
        }
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
  
+       init_waitqueue_head(&fs_info->tree_mod_seq_wait);
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
        spin_lock_init(&fs_info->reada_lock);
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
  
+       spin_lock_init(&fs_info->qgroup_lock);
+       fs_info->qgroup_tree = RB_ROOT;
+       INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+       fs_info->qgroup_seq = 1;
+       fs_info->quota_enabled = 0;
+       fs_info->pending_quota_state = 0;
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
  
        ret |= btrfs_start_workers(&fs_info->caching_workers);
        ret |= btrfs_start_workers(&fs_info->readahead_workers);
        if (ret) {
 -              ret = -ENOMEM;
 +              err = -ENOMEM;
                goto fail_sb_buffer;
        }
  
@@@ -2356,6 -2454,17 +2454,17 @@@ retry_root_backup
                goto recovery_tree_root;
        csum_root->track_dirty = 1;
  
+       ret = find_and_setup_root(tree_root, fs_info,
+                                 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
+       if (ret) {
+               kfree(quota_root);
+               quota_root = fs_info->quota_root = NULL;
+       } else {
+               quota_root->track_dirty = 1;
+               fs_info->quota_enabled = 1;
+               fs_info->pending_quota_state = 1;
+       }
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
  
                               " integrity check module %s\n", sb->s_id);
        }
  #endif
+       ret = btrfs_read_qgroup_config(fs_info);
+       if (ret)
+               goto fail_trans_kthread;
  
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
                        printk(KERN_WARNING "Btrfs log replay required "
                               "on RO media\n");
                        err = -EIO;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
                blocksize =
                     btrfs_level_size(tree_root,
                log_tree_root = btrfs_alloc_root(fs_info);
                if (!log_tree_root) {
                        err = -ENOMEM;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
  
                __setup_root(nodesize, leafsize, sectorsize, stripesize,
  
        if (!(sb->s_flags & MS_RDONLY)) {
                ret = btrfs_cleanup_fs_roots(fs_info);
 -              if (ret) {
 -                      }
 +              if (ret)
 +                      goto fail_trans_kthread;
  
                ret = btrfs_recover_relocation(tree_root);
                if (ret < 0) {
                        printk(KERN_WARNING
                               "btrfs: failed to recover relocation\n");
                        err = -EINVAL;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
        }
  
  
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (!fs_info->fs_root)
-               goto fail_trans_kthread;
+               goto fail_qgroup;
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
-               goto fail_trans_kthread;
+               goto fail_qgroup;
        }
  
        if (sb->s_flags & MS_RDONLY)
  
        return 0;
  
+ fail_qgroup:
+       btrfs_free_qgroup_config(fs_info);
  fail_trans_kthread:
        kthread_stop(fs_info->transaction_kthread);
  fail_cleaner:
@@@ -2781,7 -2895,7 +2895,7 @@@ static int write_dev_flush(struct btrfs
         * one reference for us, and we leave it for the
         * caller
         */
 -      device->flush_bio = NULL;;
 +      device->flush_bio = NULL;
        bio = bio_alloc(GFP_NOFS, 0);
        if (!bio)
                return -ENOMEM;
@@@ -3109,6 -3223,8 +3223,8 @@@ int close_ctree(struct btrfs_root *root
        fs_info->closing = 2;
        smp_mb();
  
+       btrfs_free_qgroup_config(root->fs_info);
        if (fs_info->delalloc_bytes) {
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                       (unsigned long long)fs_info->delalloc_bytes);
        free_extent_buffer(fs_info->dev_root->commit_root);
        free_extent_buffer(fs_info->csum_root->node);
        free_extent_buffer(fs_info->csum_root->commit_root);
+       if (fs_info->quota_root) {
+               free_extent_buffer(fs_info->quota_root->node);
+               free_extent_buffer(fs_info->quota_root->commit_root);
+       }
  
        btrfs_free_block_groups(fs_info);
  
@@@ -3258,7 -3378,7 +3378,7 @@@ int btrfs_read_buffer(struct extent_buf
        return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
  }
  
- static int btree_lock_page_hook(struct page *page, void *data,
+ int btree_lock_page_hook(struct page *page, void *data,
                                void (*flush_fn)(void *))
  {
        struct inode *inode = page->mapping->host;
diff --combined fs/btrfs/extent-tree.c
@@@ -34,6 -34,8 +34,8 @@@
  #include "locking.h"
  #include "free-space-cache.h"
  
+ #undef SCRAMBLE_DELAYED_REFS
  /*
   * control flags for do_chunk_alloc's force field
   * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@@ -2217,6 -2219,7 +2219,7 @@@ static noinline int run_clustered_refs(
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int count = 0;
        int must_insert_reserved = 0;
                ref = select_delayed_ref(locked_ref);
  
                if (ref && ref->seq &&
-                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        /*
                         * there are still refs with lower seq numbers in the
                         * process of being added. Don't run this ref yet.
                }
  
  next:
-               do_chunk_alloc(trans, root->fs_info->extent_root,
+               do_chunk_alloc(trans, fs_info->extent_root,
                               2 * 1024 * 1024,
                               btrfs_get_alloc_profile(root, 0),
                               CHUNK_ALLOC_NO_FORCE);
        return count;
  }
  
- static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+ static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
+                              struct btrfs_delayed_ref_root *delayed_refs,
                               unsigned long num_refs,
                               struct list_head *first_seq)
  {
        spin_unlock(&delayed_refs->lock);
        pr_debug("waiting for more refs (num %ld, first %p)\n",
                 num_refs, first_seq);
-       wait_event(delayed_refs->seq_wait,
+       wait_event(fs_info->tree_mod_seq_wait,
                   num_refs != delayed_refs->num_entries ||
-                  delayed_refs->seq_head.next != first_seq);
+                  fs_info->tree_mod_seq_list.next != first_seq);
        pr_debug("done waiting for more refs (num %ld, first %p)\n",
-                delayed_refs->num_entries, delayed_refs->seq_head.next);
+                delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
        spin_lock(&delayed_refs->lock);
  }
  
+ #ifdef SCRAMBLE_DELAYED_REFS
+ /*
+  * Normally delayed refs get processed in ascending bytenr order. This
+  * correlates in most cases to the order added. To expose dependencies on this
+  * order, we start to process the tree in the middle instead of the beginning
+  */
+ static u64 find_middle(struct rb_root *root)
+ {
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int alt = 1;
+       u64 middle;
+       u64 first = 0, last = 0;
+       n = rb_first(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               first = entry->bytenr;
+       }
+       n = rb_last(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               last = entry->bytenr;
+       }
+       n = root->rb_node;
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+               middle = entry->bytenr;
+               if (alt)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+               alt = 1 - alt;
+       }
+       return middle;
+ }
+ #endif
+ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+ {
+       struct qgroup_update *qgroup_update;
+       int ret = 0;
+       if (list_empty(&trans->qgroup_ref_list) !=
+           !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(&trans->qgroup_ref_list) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+       if (!trans->delayed_ref_elem.seq)
+               return 0;
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+                                                struct qgroup_update, list);
+               list_del(&qgroup_update->list);
+               if (!ret)
+                       ret = btrfs_qgroup_account_ref(
+                                       trans, fs_info, qgroup_update->node,
+                                       qgroup_update->extent_op);
+               kfree(qgroup_update);
+       }
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+       return ret;
+ }
  /*
   * this starts processing the delayed reference count updates and
   * extent insertions we have queued up so far.  count can be
@@@ -2398,11 -2479,18 +2479,18 @@@ int btrfs_run_delayed_refs(struct btrfs
                       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
                       CHUNK_ALLOC_NO_FORCE);
  
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
  again:
        consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
+ #ifdef SCRAMBLE_DELAYED_REFS
+       delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+ #endif
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
                run_most = 1;
                                num_refs = delayed_refs->num_entries;
                                first_seq = root->fs_info->tree_mod_seq_list.next;
                        } else {
-                               wait_for_more_refs(delayed_refs,
+                               wait_for_more_refs(root->fs_info, delayed_refs,
                                                   num_refs, first_seq);
                                /*
                                 * after waiting, things have changed. we
        }
  out:
        spin_unlock(&delayed_refs->lock);
+       assert_qgroups_uptodate(trans);
        return 0;
  }
  
@@@ -2581,10 -2670,8 +2670,10 @@@ static noinline int check_delayed_ref(s
  
        node = rb_prev(node);
        if (node) {
 +              int seq = ref->seq;
 +
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 -              if (ref->bytenr == bytenr)
 +              if (ref->bytenr == bytenr && ref->seq == seq)
                        goto out_unlock;
        }
  
@@@ -2905,13 -2992,8 +2994,13 @@@ again
        }
  
        spin_lock(&block_group->lock);
 -      if (block_group->cached != BTRFS_CACHE_FINISHED) {
 -              /* We're not cached, don't bother trying to write stuff out */
 +      if (block_group->cached != BTRFS_CACHE_FINISHED ||
 +          !btrfs_test_opt(root, SPACE_CACHE)) {
 +              /*
 +               * don't bother trying to write stuff out _if_
 +               * a) we're not cached,
 +               * b) we're with nospace_cache mount option.
 +               */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
@@@ -3141,8 -3223,6 +3230,8 @@@ static int update_space_info(struct btr
        init_waitqueue_head(&found->wait);
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
 +      if (flags & BTRFS_BLOCK_GROUP_DATA)
 +              info->data_sinfo = found;
        return 0;
  }
  
@@@ -3272,6 -3352,12 +3361,6 @@@ u64 btrfs_get_alloc_profile(struct btrf
        return get_alloc_profile(root, flags);
  }
  
 -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 -{
 -      BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
 -                                                     BTRFS_BLOCK_GROUP_DATA);
 -}
 -
  /*
   * This will check the space that the inode allocates from to make sure we have
   * enough space for bytes.
@@@ -3280,7 -3366,6 +3369,7 @@@ int btrfs_check_data_free_space(struct 
  {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
        int ret = 0, committed = 0, alloc_chunk = 1;
  
                committed = 1;
        }
  
 -      data_sinfo = BTRFS_I(inode)->space_info;
 +      data_sinfo = fs_info->data_sinfo;
        if (!data_sinfo)
                goto alloc;
  
@@@ -3334,9 -3419,10 +3423,9 @@@ alloc
                                        goto commit_trans;
                        }
  
 -                      if (!data_sinfo) {
 -                              btrfs_set_inode_space_info(root, inode);
 -                              data_sinfo = BTRFS_I(inode)->space_info;
 -                      }
 +                      if (!data_sinfo)
 +                              data_sinfo = fs_info->data_sinfo;
 +
                        goto again;
                }
  
@@@ -3383,7 -3469,7 +3472,7 @@@ void btrfs_free_reserved_data_space(str
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
 -      data_sinfo = BTRFS_I(inode)->space_info;
 +      data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
  /*
   * shrink metadata reservation for delalloc
   */
 -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
 -                         bool wait_ordered)
 +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 +                          bool wait_ordered)
  {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
 -      u64 reserved;
 +      u64 delalloc_bytes;
        u64 max_reclaim;
 -      u64 reclaimed = 0;
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
 -      unsigned long progress;
  
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
  
        smp_mb();
 -      reserved = space_info->bytes_may_use;
 -      progress = space_info->reservation_progress;
 -
 -      if (reserved == 0)
 -              return 0;
 -
 -      smp_mb();
 -      if (root->fs_info->delalloc_bytes == 0) {
 +      delalloc_bytes = root->fs_info->delalloc_bytes;
 +      if (delalloc_bytes == 0) {
                if (trans)
 -                      return 0;
 +                      return;
                btrfs_wait_ordered_extents(root, 0, 0);
 -              return 0;
 +              return;
        }
  
 -      max_reclaim = min(reserved, to_reclaim);
 -      nr_pages = max_t(unsigned long, nr_pages,
 -                       max_reclaim >> PAGE_CACHE_SHIFT);
 -      while (loops < 1024) {
 -              /* have the flusher threads jump in and do some IO */
 -              smp_mb();
 -              nr_pages = min_t(unsigned long, nr_pages,
 -                     root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
 +      while (delalloc_bytes && loops < 3) {
 +              max_reclaim = min(delalloc_bytes, to_reclaim);
 +              nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
 -                                              WB_REASON_FS_FREE_SPACE);
 +                                             WB_REASON_FS_FREE_SPACE);
  
                spin_lock(&space_info->lock);
 -              if (reserved > space_info->bytes_may_use)
 -                      reclaimed += reserved - space_info->bytes_may_use;
 -              reserved = space_info->bytes_may_use;
 +              if (space_info->bytes_used + space_info->bytes_reserved +
 +                  space_info->bytes_pinned + space_info->bytes_readonly +
 +                  space_info->bytes_may_use + orig <=
 +                  space_info->total_bytes) {
 +                      spin_unlock(&space_info->lock);
 +                      break;
 +              }
                spin_unlock(&space_info->lock);
  
                loops++;
 -
 -              if (reserved == 0 || reclaimed >= max_reclaim)
 -                      break;
 -
 -              if (trans && trans->transaction->blocked)
 -                      return -EAGAIN;
 -
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_extents(root, 0, 0);
                } else {
 -                      time_left = schedule_timeout_interruptible(1);
 -
 -                      /* We were interrupted, exit */
 +                      time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
 -
 -              /* we've kicked the IO a few times, if anything has been freed,
 -               * exit.  There is no sense in looping here for a long time
 -               * when we really need to commit the transaction, or there are
 -               * just too many writers without enough free space
 -               */
 -
 -              if (loops > 3) {
 -                      smp_mb();
 -                      if (progress != space_info->reservation_progress)
 -                              break;
 -              }
 -
 +              smp_mb();
 +              delalloc_bytes = root->fs_info->delalloc_bytes;
        }
 -
 -      return reclaimed >= to_reclaim;
  }
  
  /**
@@@ -3700,58 -3817,6 +3789,58 @@@ commit
        return btrfs_commit_transaction(trans, root);
  }
  
 +enum flush_state {
 +      FLUSH_DELALLOC          =       1,
 +      FLUSH_DELALLOC_WAIT     =       2,
 +      FLUSH_DELAYED_ITEMS_NR  =       3,
 +      FLUSH_DELAYED_ITEMS     =       4,
 +      COMMIT_TRANS            =       5,
 +};
 +
 +static int flush_space(struct btrfs_root *root,
 +                     struct btrfs_space_info *space_info, u64 num_bytes,
 +                     u64 orig_bytes, int state)
 +{
 +      struct btrfs_trans_handle *trans;
 +      int nr;
 +      int ret = 0;
 +
 +      switch (state) {
 +      case FLUSH_DELALLOC:
 +      case FLUSH_DELALLOC_WAIT:
 +              shrink_delalloc(root, num_bytes, orig_bytes,
 +                              state == FLUSH_DELALLOC_WAIT);
 +              break;
 +      case FLUSH_DELAYED_ITEMS_NR:
 +      case FLUSH_DELAYED_ITEMS:
 +              if (state == FLUSH_DELAYED_ITEMS_NR) {
 +                      u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
 +
 +                      nr = (int)div64_u64(num_bytes, bytes);
 +                      if (!nr)
 +                              nr = 1;
 +                      nr *= 2;
 +              } else {
 +                      nr = -1;
 +              }
 +              trans = btrfs_join_transaction(root);
 +              if (IS_ERR(trans)) {
 +                      ret = PTR_ERR(trans);
 +                      break;
 +              }
 +              ret = btrfs_run_delayed_items_nr(trans, root, nr);
 +              btrfs_end_transaction(trans, root);
 +              break;
 +      case COMMIT_TRANS:
 +              ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 +              break;
 +      default:
 +              ret = -ENOSPC;
 +              break;
 +      }
 +
 +      return ret;
 +}
  /**
   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
   * @root - the root we're allocating for
@@@ -3773,10 -3838,11 +3862,10 @@@ static int reserve_metadata_bytes(struc
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
 -      int retries = 0;
 +      int flush_state = FLUSH_DELALLOC;
        int ret = 0;
 -      bool committed = false;
        bool flushing = false;
 -      bool wait_ordered = false;
 +      bool committed = false;
  
  again:
        ret = 0;
                 * amount plus the amount of bytes that we need for this
                 * reservation.
                 */
 -              wait_ordered = true;
                num_bytes = used - space_info->total_bytes +
 -                      (orig_bytes * (retries + 1));
 +                      (orig_bytes * 2);
        }
  
        if (ret) {
                        trace_btrfs_space_reservation(root->fs_info,
                                "space_info", space_info->flags, orig_bytes, 1);
                        ret = 0;
 -              } else {
 -                      wait_ordered = true;
                }
        }
  
        if (!ret || !flush)
                goto out;
  
 -      /*
 -       * We do synchronous shrinking since we don't actually unreserve
 -       * metadata until after the IO is completed.
 -       */
 -      ret = shrink_delalloc(root, num_bytes, wait_ordered);
 -      if (ret < 0)
 -              goto out;
 -
 -      ret = 0;
 -
 -      /*
 -       * So if we were overcommitted it's possible that somebody else flushed
 -       * out enough space and we simply didn't have enough space to reclaim,
 -       * so go back around and try again.
 -       */
 -      if (retries < 2) {
 -              wait_ordered = true;
 -              retries++;
 +      ret = flush_space(root, space_info, num_bytes, orig_bytes,
 +                        flush_state);
 +      flush_state++;
 +      if (!ret)
                goto again;
 -      }
 -
 -      ret = -ENOSPC;
 -      if (committed)
 -              goto out;
 -
 -      ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 -      if (!ret) {
 -              committed = true;
 +      else if (flush_state <= COMMIT_TRANS)
                goto again;
 -      }
  
  out:
        if (flushing) {
@@@ -3931,10 -4023,7 +4020,10 @@@ static struct btrfs_block_rsv *get_bloc
  {
        struct btrfs_block_rsv *block_rsv = NULL;
  
 -      if (root->ref_cows || root == root->fs_info->csum_root)
 +      if (root->ref_cows)
 +              block_rsv = trans->block_rsv;
 +
 +      if (root == root->fs_info->csum_root && trans->adding_csums)
                block_rsv = trans->block_rsv;
  
        if (!block_rsv)
@@@ -4286,9 -4375,6 +4375,9 @@@ static void release_global_block_rsv(st
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
  {
 +      if (!trans->block_rsv)
 +              return;
 +
        if (!trans->bytes_reserved)
                return;
  
@@@ -4447,7 -4533,7 +4536,7 @@@ int btrfs_delalloc_reserve_metadata(str
        int ret;
  
        /* Need to be holding the i_mutex here if we aren't free space cache */
 -      if (btrfs_is_free_space_inode(root, inode))
 +      if (btrfs_is_free_space_inode(inode))
                flush = 0;
  
        if (flush && btrfs_transaction_in_commit(root->fs_info))
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
  
+       if (root->fs_info->quota_enabled) {
+               ret = btrfs_qgroup_reserve(root, num_bytes +
+                                          nr_extents * root->leafsize);
+               if (ret)
+                       return ret;
+       }
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
@@@ -4557,6 -4650,11 +4653,11 @@@ void btrfs_delalloc_release_metadata(st
  
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
+       if (root->fs_info->quota_enabled) {
+               btrfs_qgroup_free(root, num_bytes +
+                                       dropped * root->leafsize);
+       }
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
  }
@@@ -5193,8 -5291,8 +5294,8 @@@ static noinline int check_ref_cleanup(s
        rb_erase(&head->node.rb_node, &delayed_refs->root);
  
        delayed_refs->num_entries--;
-       if (waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
+               wake_up(&root->fs_info->tree_mod_seq_wait);
  
        /*
         * we don't take a ref on the node because we're removing it from the
@@@ -5751,11 -5849,7 +5852,11 @@@ loop
                                ret = do_chunk_alloc(trans, root, num_bytes +
                                                     2 * 1024 * 1024, data,
                                                     CHUNK_ALLOC_LIMITED);
 -                              if (ret < 0) {
 +                              /*
 +                               * Do not bail out on ENOSPC since we
 +                               * can do more things.
 +                               */
 +                              if (ret < 0 && ret != -ENOSPC) {
                                        btrfs_abort_transaction(trans,
                                                                root, ret);
                                        goto out;
@@@ -5823,13 -5917,13 +5924,13 @@@ static void dump_space_info(struct btrf
  again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
 -              printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 -                     "%llu pinned %llu reserved\n",
 +              printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
                       (unsigned long long)cache->key.objectid,
                       (unsigned long long)cache->key.offset,
                       (unsigned long long)btrfs_block_group_used(&cache->item),
                       (unsigned long long)cache->pinned,
 -                     (unsigned long long)cache->reserved);
 +                     (unsigned long long)cache->reserved,
 +                     cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
@@@ -7617,21 -7711,8 +7718,21 @@@ int btrfs_read_block_groups(struct btrf
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
  
 -              if (need_clear)
 +              if (need_clear) {
 +                      /*
 +                       * When we mount with old space cache, we need to
 +                       * set BTRFS_DC_CLEAR and set dirty flag.
 +                       *
 +                       * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
 +                       *    truncate the old free space cache inode and
 +                       *    setup a new one.
 +                       * b) Setting 'dirty flag' makes sure that we flush
 +                       *    the new space cache info onto disk.
 +                       */
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
 +                      if (btrfs_test_opt(root, SPACE_CACHE))
 +                              cache->dirty = 1;
 +              }
  
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
diff --combined fs/btrfs/ioctl.c
@@@ -336,7 -336,8 +336,8 @@@ static noinline int btrfs_ioctl_fitrim(
  static noinline int create_subvol(struct btrfs_root *root,
                                  struct dentry *dentry,
                                  char *name, int namelen,
-                                 u64 *async_transid)
+                                 u64 *async_transid,
+                                 struct btrfs_qgroup_inherit **inherit)
  {
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        if (IS_ERR(trans))
                return PTR_ERR(trans);
  
+       ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
+                                  inherit ? *inherit : NULL);
+       if (ret)
+               goto fail;
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
                                      0, objectid, NULL, 0, 0, 0);
        if (IS_ERR(leaf)) {
@@@ -484,7 -490,7 +490,7 @@@ fail
  
  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                           char *name, int namelen, u64 *async_transid,
-                          bool readonly)
+                          bool readonly, struct btrfs_qgroup_inherit **inherit)
  {
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
+       if (inherit) {
+               pending_snapshot->inherit = *inherit;
+               *inherit = NULL;        /* take responsibility to free it */
+       }
  
        trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
        if (IS_ERR(trans)) {
@@@ -635,7 -645,8 +645,8 @@@ static inline int btrfs_may_create(stru
  static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
-                                  u64 *async_transid, bool readonly)
+                                  u64 *async_transid, bool readonly,
+                                  struct btrfs_qgroup_inherit **inherit)
  {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
        if (dentry->d_inode)
                goto out_dput;
  
 -      error = mnt_want_write(parent->mnt);
 -      if (error)
 -              goto out_dput;
 -
        error = btrfs_may_create(dir, dentry);
        if (error)
 -              goto out_drop_write;
 +              goto out_dput;
  
        down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
  
                goto out_up_read;
  
        if (snap_src) {
-               error = create_snapshot(snap_src, dentry,
-                                       name, namelen, async_transid, readonly);
+               error = create_snapshot(snap_src, dentry, name, namelen,
+                                       async_transid, readonly, inherit);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
-                                     name, namelen, async_transid);
+                                     name, namelen, async_transid, inherit);
        }
        if (!error)
                fsnotify_mkdir(dir, dentry);
  out_up_read:
        up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 -out_drop_write:
 -      mnt_drop_write(parent->mnt);
  out_dput:
        dput(dentry);
  out_unlock:
@@@ -826,8 -843,7 +837,8 @@@ static bool defrag_check_next_extent(st
  }
  
  static int should_defrag_range(struct inode *inode, u64 start, int thresh,
 -                             u64 *last_len, u64 *skip, u64 *defrag_end)
 +                             u64 *last_len, u64 *skip, u64 *defrag_end,
 +                             int compress)
  {
        struct extent_map *em;
        int ret = 1;
         * we hit a real extent, if it is big or the next extent is not a
         * real extent, don't bother defragging it
         */
 -      if ((*last_len == 0 || *last_len >= thresh) &&
 +      if (!compress && (*last_len == 0 || *last_len >= thresh) &&
            (em->len >= thresh || !next_mergeable))
                ret = 0;
  out:
@@@ -1140,8 -1156,7 +1151,8 @@@ int btrfs_defrag_file(struct inode *ino
  
                if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
                                         extent_thresh, &last_len, &skip,
 -                                       &defrag_end)) {
 +                                       &defrag_end, range->flags &
 +                                       BTRFS_DEFRAG_RANGE_COMPRESS)) {
                        unsigned long next;
                        /*
                         * the should_defrag function tells us how much to skip
  }
  
  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
-                                                   char *name,
-                                                   unsigned long fd,
-                                                   int subvol,
-                                                   u64 *transid,
-                                                   bool readonly)
+                               char *name, unsigned long fd, int subvol,
+                               u64 *transid, bool readonly,
+                               struct btrfs_qgroup_inherit **inherit)
  {
 -      struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct file *src_file;
        int namelen;
        int ret = 0;
  
 -      if (root->fs_info->sb->s_flags & MS_RDONLY)
 -              return -EROFS;
 +      ret = mnt_want_write_file(file);
 +      if (ret)
 +              goto out;
  
        namelen = strlen(name);
        if (strchr(name, '/')) {
                ret = -EINVAL;
 -              goto out;
 +              goto out_drop_write;
        }
  
        if (name[0] == '.' &&
           (namelen == 1 || (name[1] == '.' && namelen == 2))) {
                ret = -EEXIST;
 -              goto out;
 +              goto out_drop_write;
        }
  
        if (subvol) {
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                    NULL, transid, readonly);
+                                    NULL, transid, readonly, inherit);
        } else {
                struct inode *src_inode;
                src_file = fget(fd);
                if (!src_file) {
                        ret = -EINVAL;
 -                      goto out;
 +                      goto out_drop_write;
                }
  
                src_inode = src_file->f_path.dentry->d_inode;
                               "another FS\n");
                        ret = -EINVAL;
                        fput(src_file);
 -                      goto out;
 +                      goto out_drop_write;
                }
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
                                     BTRFS_I(src_inode)->root,
-                                    transid, readonly);
+                                    transid, readonly, inherit);
                fput(src_file);
        }
 +out_drop_write:
 +      mnt_drop_write_file(file);
  out:
        return ret;
  }
@@@ -1444,7 -1455,7 +1453,7 @@@ static noinline int btrfs_ioctl_snap_cr
  
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
                                              vol_args->fd, subvol,
-                                             NULL, false);
+                                             NULL, false, NULL);
  
        kfree(vol_args);
        return ret;
@@@ -1458,6 -1469,7 +1467,7 @@@ static noinline int btrfs_ioctl_snap_cr
        u64 transid = 0;
        u64 *ptr = NULL;
        bool readonly = false;
+       struct btrfs_qgroup_inherit *inherit = NULL;
  
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
  
        if (vol_args->flags &
-           ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+           ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+             BTRFS_SUBVOL_QGROUP_INHERIT)) {
                ret = -EOPNOTSUPP;
                goto out;
        }
                ptr = &transid;
        if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
                readonly = true;
+       if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+               if (vol_args->size > PAGE_CACHE_SIZE) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+               if (IS_ERR(inherit)) {
+                       ret = PTR_ERR(inherit);
+                       goto out;
+               }
+       }
  
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-                                             vol_args->fd, subvol,
-                                             ptr, readonly);
+                                             vol_args->fd, subvol, ptr,
+                                             readonly, &inherit);
  
        if (ret == 0 && ptr &&
            copy_to_user(arg +
                ret = -EFAULT;
  out:
        kfree(vol_args);
+       kfree(inherit);
        return ret;
  }
  
@@@ -1521,40 -1546,29 +1544,40 @@@ static noinline int btrfs_ioctl_subvol_
        u64 flags;
        int ret = 0;
  
 -      if (root->fs_info->sb->s_flags & MS_RDONLY)
 -              return -EROFS;
 +      ret = mnt_want_write_file(file);
 +      if (ret)
 +              goto out;
  
 -      if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
 -              return -EINVAL;
 +      if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
 +              ret = -EINVAL;
 +              goto out_drop_write;
 +      }
  
 -      if (copy_from_user(&flags, arg, sizeof(flags)))
 -              return -EFAULT;
 +      if (copy_from_user(&flags, arg, sizeof(flags))) {
 +              ret = -EFAULT;
 +              goto out_drop_write;
 +      }
  
 -      if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
 -              return -EINVAL;
 +      if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
 +              ret = -EINVAL;
 +              goto out_drop_write;
 +      }
  
 -      if (flags & ~BTRFS_SUBVOL_RDONLY)
 -              return -EOPNOTSUPP;
 +      if (flags & ~BTRFS_SUBVOL_RDONLY) {
 +              ret = -EOPNOTSUPP;
 +              goto out_drop_write;
 +      }
  
 -      if (!inode_owner_or_capable(inode))
 -              return -EACCES;
 +      if (!inode_owner_or_capable(inode)) {
 +              ret = -EACCES;
 +              goto out_drop_write;
 +      }
  
        down_write(&root->fs_info->subvol_sem);
  
        /* nothing to do */
        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
 -              goto out;
 +              goto out_drop_sem;
  
        root_flags = btrfs_root_flags(&root->root_item);
        if (flags & BTRFS_SUBVOL_RDONLY)
  out_reset:
        if (ret)
                btrfs_set_root_flags(&root->root_item, root_flags);
 -out:
 +out_drop_sem:
        up_write(&root->fs_info->subvol_sem);
 +out_drop_write:
 +      mnt_drop_write_file(file);
 +out:
        return ret;
  }
  
@@@ -3075,21 -3086,19 +3098,21 @@@ static long btrfs_ioctl_scrub_progress(
  }
  
  static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
 -                                    void __user *arg, int reset_after_read)
 +                                    void __user *arg)
  {
        struct btrfs_ioctl_get_dev_stats *sa;
        int ret;
  
 -      if (reset_after_read && !capable(CAP_SYS_ADMIN))
 -              return -EPERM;
 -
        sa = memdup_user(arg, sizeof(*sa));
        if (IS_ERR(sa))
                return PTR_ERR(sa);
  
 -      ret = btrfs_get_dev_stats(root, sa, reset_after_read);
 +      if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
 +              kfree(sa);
 +              return -EPERM;
 +      }
 +
 +      ret = btrfs_get_dev_stats(root, sa);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
@@@ -3279,7 -3288,10 +3302,7 @@@ static long btrfs_ioctl_balance(struct 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
 -      if (fs_info->sb->s_flags & MS_RDONLY)
 -              return -EROFS;
 -
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                return ret;
  
@@@ -3349,7 -3361,7 +3372,7 @@@ out_bargs
  out:
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return ret;
  }
  
        return ret;
  }
  
+ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_quota_ctl_args *sa;
+       struct btrfs_trans_handle *trans = NULL;
+       int ret;
+       int err;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+       if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       goto out;
+               }
+       }
+       switch (sa->cmd) {
+       case BTRFS_QUOTA_CTL_ENABLE:
+               ret = btrfs_quota_enable(trans, root->fs_info);
+               break;
+       case BTRFS_QUOTA_CTL_DISABLE:
+               ret = btrfs_quota_disable(trans, root->fs_info);
+               break;
+       case BTRFS_QUOTA_CTL_RESCAN:
+               ret = btrfs_quota_rescan(root->fs_info);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       if (copy_to_user(arg, sa, sizeof(*sa)))
+               ret = -EFAULT;
+       if (trans) {
+               err = btrfs_commit_transaction(trans, root);
+               if (err && !ret)
+                       ret = err;
+       }
+ out:
+       kfree(sa);
+       return ret;
+ }
+ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_qgroup_assign_args *sa;
+       struct btrfs_trans_handle *trans;
+       int ret;
+       int err;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+       /* FIXME: check if the IDs really exist */
+       if (sa->assign) {
+               ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+                                               sa->src, sa->dst);
+       } else {
+               ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+                                               sa->src, sa->dst);
+       }
+       err = btrfs_end_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+ out:
+       kfree(sa);
+       return ret;
+ }
+ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_qgroup_create_args *sa;
+       struct btrfs_trans_handle *trans;
+       int ret;
+       int err;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+       /* FIXME: check if the IDs really exist */
+       if (sa->create) {
+               ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
+                                         NULL);
+       } else {
+               ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+       }
+       err = btrfs_end_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+ out:
+       kfree(sa);
+       return ret;
+ }
+ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_qgroup_limit_args *sa;
+       struct btrfs_trans_handle *trans;
+       int ret;
+       int err;
+       u64 qgroupid;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+       qgroupid = sa->qgroupid;
+       if (!qgroupid) {
+               /* take the current subvol as qgroup */
+               qgroupid = root->root_key.objectid;
+       }
+       /* FIXME: check if the IDs really exist */
+       ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+       err = btrfs_end_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+ out:
+       kfree(sa);
+       return ret;
+ }
  long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
  {
                return btrfs_ioctl_snap_create_v2(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 1);
+       case BTRFS_IOC_SUBVOL_CREATE_V2:
+               return btrfs_ioctl_snap_create_v2(file, argp, 1);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp);
        case BTRFS_IOC_SUBVOL_GETFLAGS:
        case BTRFS_IOC_BALANCE_PROGRESS:
                return btrfs_ioctl_balance_progress(root, argp);
        case BTRFS_IOC_GET_DEV_STATS:
 -              return btrfs_ioctl_get_dev_stats(root, argp, 0);
 -      case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
 -              return btrfs_ioctl_get_dev_stats(root, argp, 1);
 +              return btrfs_ioctl_get_dev_stats(root, argp);
+       case BTRFS_IOC_QUOTA_CTL:
+               return btrfs_ioctl_quota_ctl(root, argp);
+       case BTRFS_IOC_QGROUP_ASSIGN:
+               return btrfs_ioctl_qgroup_assign(root, argp);
+       case BTRFS_IOC_QGROUP_CREATE:
+               return btrfs_ioctl_qgroup_create(root, argp);
+       case BTRFS_IOC_QGROUP_LIMIT:
+               return btrfs_ioctl_qgroup_limit(root, argp);
        }
  
        return -ENOTTY;
diff --combined fs/btrfs/ioctl.h
@@@ -32,15 -32,46 +32,46 @@@ struct btrfs_ioctl_vol_args 
  
  #define BTRFS_SUBVOL_CREATE_ASYNC     (1ULL << 0)
  #define BTRFS_SUBVOL_RDONLY           (1ULL << 1)
+ #define BTRFS_SUBVOL_QGROUP_INHERIT   (1ULL << 2)
  #define BTRFS_FSID_SIZE 16
  #define BTRFS_UUID_SIZE 16
  
+ #define BTRFS_QGROUP_INHERIT_SET_LIMITS       (1ULL << 0)
+ struct btrfs_qgroup_limit {
+       __u64   flags;
+       __u64   max_rfer;
+       __u64   max_excl;
+       __u64   rsv_rfer;
+       __u64   rsv_excl;
+ };
+ struct btrfs_qgroup_inherit {
+       __u64   flags;
+       __u64   num_qgroups;
+       __u64   num_ref_copies;
+       __u64   num_excl_copies;
+       struct btrfs_qgroup_limit lim;
+       __u64   qgroups[0];
+ };
+ struct btrfs_ioctl_qgroup_limit_args {
+       __u64   qgroupid;
+       struct btrfs_qgroup_limit lim;
+ };
  #define BTRFS_SUBVOL_NAME_MAX 4039
  struct btrfs_ioctl_vol_args_v2 {
        __s64 fd;
        __u64 transid;
        __u64 flags;
-       __u64 unused[4];
+       union {
+               struct {
+                       __u64 size;
+                       struct btrfs_qgroup_inherit __user *qgroup_inherit;
+               };
+               __u64 unused[4];
+       };
        char name[BTRFS_SUBVOL_NAME_MAX + 1];
  };
  
@@@ -285,13 -316,9 +316,13 @@@ enum btrfs_dev_stat_values 
        BTRFS_DEV_STAT_VALUES_MAX
  };
  
 +/* Reset statistics after reading; needs SYS_ADMIN capability */
 +#define       BTRFS_DEV_STATS_RESET           (1ULL << 0)
 +
  struct btrfs_ioctl_get_dev_stats {
        __u64 devid;                            /* in */
        __u64 nr_items;                         /* in/out */
 +      __u64 flags;                            /* in/out */
  
        /* out values: */
        __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
        __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
  };
  
+ #define BTRFS_QUOTA_CTL_ENABLE        1
+ #define BTRFS_QUOTA_CTL_DISABLE       2
+ #define BTRFS_QUOTA_CTL_RESCAN        3
+ struct btrfs_ioctl_quota_ctl_args {
+       __u64 cmd;
+       __u64 status;
+ };
+ struct btrfs_ioctl_qgroup_assign_args {
+       __u64 assign;
+       __u64 src;
+       __u64 dst;
+ };
+ struct btrfs_ioctl_qgroup_create_args {
+       __u64 create;
+       __u64 qgroupid;
+ };
  #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
  #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
  #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
  #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
+ #define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
+                                  struct btrfs_ioctl_vol_args_v2)
  #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
  #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
  #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
                                        struct btrfs_ioctl_ino_path_args)
  #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
                                        struct btrfs_ioctl_ino_path_args)
 -#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
 -                                    struct btrfs_ioctl_get_dev_stats)
 -#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
 -                                      struct btrfs_ioctl_get_dev_stats)
 -
 +#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
 +                                   struct btrfs_ioctl_vol_args)
+ #define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
+                              struct btrfs_ioctl_quota_ctl_args)
+ #define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
+                              struct btrfs_ioctl_qgroup_assign_args)
+ #define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
+                              struct btrfs_ioctl_qgroup_create_args)
+ #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
+                              struct btrfs_ioctl_qgroup_limit_args)
 +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
 +                                    struct btrfs_ioctl_get_dev_stats)
  #endif
diff --combined fs/btrfs/transaction.c
@@@ -38,7 -38,6 +38,6 @@@ void put_transaction(struct btrfs_trans
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
                WARN_ON(transaction->delayed_refs.root.rb_node);
-               WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@@ -100,8 -99,8 +99,8 @@@ loop
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
                cur_trans = fs_info->running_transaction;
                goto loop;
 -      } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
 -              spin_unlock(&root->fs_info->trans_lock);
 +      } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
 +              spin_unlock(&fs_info->trans_lock);
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
                return -EROFS;
        }
        cur_trans->delayed_refs.num_heads = 0;
        cur_trans->delayed_refs.flushing = 0;
        cur_trans->delayed_refs.run_delayed_start = 0;
-       cur_trans->delayed_refs.seq = 1;
  
        /*
         * although the tree mod log is per file system and not per transaction,
        }
        atomic_set(&fs_info->tree_mod_seq, 0);
  
-       init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
-       INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
  
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@@ -299,6 -295,7 +295,7 @@@ static struct btrfs_trans_handle *start
        struct btrfs_transaction *cur_trans;
        u64 num_bytes = 0;
        int ret;
+       u64 qgroup_reserved = 0;
  
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
+               if (root->fs_info->quota_enabled &&
+                   is_fstree(root->root_key.objectid)) {
+                       qgroup_reserved = num_items * root->leafsize;
+                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+                       if (ret)
+                               return ERR_PTR(ret);
+               }
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                ret = btrfs_block_rsv_add(root,
                                          &root->fs_info->trans_block_rsv,
@@@ -349,12 -354,15 +354,16 @@@ again
        h->transaction = cur_trans;
        h->blocks_used = 0;
        h->bytes_reserved = 0;
+       h->root = root;
        h->delayed_ref_updates = 0;
        h->use_count = 1;
 +      h->adding_csums = 0;
        h->block_rsv = NULL;
        h->orig_rsv = NULL;
        h->aborted = 0;
+       h->qgroup_reserved = qgroup_reserved;
+       h->delayed_ref_elem.seq = 0;
+       INIT_LIST_HEAD(&h->qgroup_ref_list);
  
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@@ -474,6 -482,7 +483,6 @@@ int btrfs_should_end_transaction(struc
                                 struct btrfs_root *root)
  {
        struct btrfs_transaction *cur_trans = trans->transaction;
 -      struct btrfs_block_rsv *rsv = trans->block_rsv;
        int updates;
        int err;
  
        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
                return 1;
  
 -      /*
 -       * We need to do this in case we're deleting csums so the global block
 -       * rsv get's used instead of the csum block rsv.
 -       */
 -      trans->block_rsv = NULL;
 -
        updates = trans->delayed_ref_updates;
        trans->delayed_ref_updates = 0;
        if (updates) {
                        return err;
        }
  
 -      trans->block_rsv = rsv;
 -
        return should_end_transaction(trans, root);
  }
  
@@@ -505,6 -522,24 +514,24 @@@ static int __btrfs_end_transaction(stru
                return 0;
        }
  
+       /*
+        * do the qgroup accounting as early as possible
+        */
+       err = btrfs_delayed_refs_qgroup_accounting(trans, info);
+       btrfs_trans_release_metadata(trans, root);
+       trans->block_rsv = NULL;
+       /*
+        * the same root has to be passed to start_transaction and
+        * end_transaction. Subvolume quota depends on this.
+        */
+       WARN_ON(trans->root != root);
+       if (trans->qgroup_reserved) {
+               btrfs_qgroup_free(root, trans->qgroup_reserved);
+               trans->qgroup_reserved = 0;
+       }
        while (count < 2) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                }
                count++;
        }
 +      btrfs_trans_release_metadata(trans, root);
 +      trans->block_rsv = NULL;
  
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
            should_end_transaction(trans, root)) {
            root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                err = -EIO;
        }
+       assert_qgroups_uptodate(trans);
  
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@@ -777,6 -811,13 +805,13 @@@ static noinline int commit_cowonly_root
        ret = btrfs_run_dev_stats(trans, root->fs_info);
        BUG_ON(ret);
  
+       ret = btrfs_run_qgroups(trans, root->fs_info);
+       BUG_ON(ret);
+       /* run_qgroups might have added some more refs */
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
@@@ -949,6 -990,14 +984,14 @@@ static noinline int create_pending_snap
                }
        }
  
+       ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
+                                  objectid, pending->inherit);
+       kfree(pending->inherit);
+       if (ret) {
+               pending->error = ret;
+               goto fail;
+       }
        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@@ -1261,6 -1310,9 +1304,6 @@@ int btrfs_commit_transaction(struct btr
  
        btrfs_run_ordered_operations(root, 0);
  
 -      btrfs_trans_release_metadata(trans, root);
 -      trans->block_rsv = NULL;
 -
        if (cur_trans->aborted)
                goto cleanup_transaction;
  
        if (ret)
                goto cleanup_transaction;
  
 +      btrfs_trans_release_metadata(trans, root);
 +      trans->block_rsv = NULL;
 +
        cur_trans = trans->transaction;
  
        /*
                spin_unlock(&root->fs_info->trans_lock);
        }
  
 -      if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
 +      if (!btrfs_test_opt(root, SSD) &&
 +          (now < cur_trans->start_time || now - cur_trans->start_time < 1))
                should_grow = 1;
  
        do {
                if (ret)
                        goto cleanup_transaction;
  
+               /*
+                * running the delayed items may have added new refs. account
+                * them now so that they hinder processing of more delayed refs
+                * as little as possible.
+                */
+               btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
                /*
                 * rename don't use btrfs_join_transaction, so, once we
                 * set the transaction to blocked above, we aren't going
                            root->fs_info->chunk_root->node);
        switch_commit_root(root->fs_info->chunk_root);
  
+       assert_qgroups_uptodate(trans);
        update_super_roots(root);
  
        if (!root->fs_info->log_root_recovering) {
        return ret;
  
  cleanup_transaction:
 +      btrfs_trans_release_metadata(trans, root);
 +      trans->block_rsv = NULL;
        btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
  //    WARN_ON(1);
        if (current->journal_info == trans)
diff --combined fs/btrfs/transaction.h
@@@ -20,6 -20,7 +20,7 @@@
  #define __BTRFS_TRANSACTION__
  #include "btrfs_inode.h"
  #include "delayed-ref.h"
+ #include "ctree.h"
  
  struct btrfs_transaction {
        u64 transid;
@@@ -49,6 -50,7 +50,7 @@@
  struct btrfs_trans_handle {
        u64 transid;
        u64 bytes_reserved;
+       u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_block_rsv *orig_rsv;
        int aborted;
 +      int adding_csums;
+       /*
+        * this root is only needed to validate that the root passed to
+        * start_transaction is the same as the one passed to end_transaction.
+        * Subvolume quota depends on this
+        */
+       struct btrfs_root *root;
+       struct seq_list delayed_ref_elem;
+       struct list_head qgroup_ref_list;
  };
  
  struct btrfs_pending_snapshot {
        struct dentry *dentry;
        struct btrfs_root *root;
        struct btrfs_root *snap;
+       struct btrfs_qgroup_inherit *inherit;
        /* block reservation for the operation */
        struct btrfs_block_rsv block_rsv;
        /* extra metadata reseration for relocation */