btrfs: move functions comments from qgroup.h to qgroup.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011 STRATO.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include <linux/workqueue.h>
13 #include <linux/btrfs.h>
14 #include <linux/sched/mm.h>
15
16 #include "ctree.h"
17 #include "transaction.h"
18 #include "disk-io.h"
19 #include "locking.h"
20 #include "ulist.h"
21 #include "backref.h"
22 #include "extent_io.h"
23 #include "qgroup.h"
24 #include "block-group.h"
25 #include "sysfs.h"
26 #include "tree-mod-log.h"
27 #include "fs.h"
28 #include "accessors.h"
29 #include "extent-tree.h"
30 #include "root-tree.h"
31 #include "tree-checker.h"
32
33 /*
34  * Helpers to access qgroup reservation
35  *
36  * Callers should ensure the lock context and type are valid
37  */
38
39 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
40 {
41         u64 ret = 0;
42         int i;
43
44         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
45                 ret += qgroup->rsv.values[i];
46
47         return ret;
48 }
49
50 #ifdef CONFIG_BTRFS_DEBUG
51 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
52 {
53         if (type == BTRFS_QGROUP_RSV_DATA)
54                 return "data";
55         if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
56                 return "meta_pertrans";
57         if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
58                 return "meta_prealloc";
59         return NULL;
60 }
61 #endif
62
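/*
 * Add @num_bytes to the @type reserved space counter of @qgroup and emit the
 * corresponding tracepoint.  No overflow checking is done here.
 */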
63 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
64                            struct btrfs_qgroup *qgroup, u64 num_bytes,
65                            enum btrfs_qgroup_rsv_type type)
66 {
67         trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
68         qgroup->rsv.values[type] += num_bytes;
69 }
70
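/*
 * Release @num_bytes from the @type reserved space counter of @qgroup.
 *
 * On underflow, warn (ratelimited, debug builds only) and clamp the counter
 * to 0 instead of wrapping around.
 */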
71 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
72                                struct btrfs_qgroup *qgroup, u64 num_bytes,
73                                enum btrfs_qgroup_rsv_type type)
74 {
75         trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
76         if (qgroup->rsv.values[type] >= num_bytes) {
77                 qgroup->rsv.values[type] -= num_bytes;
78                 return;
79         }
80 #ifdef CONFIG_BTRFS_DEBUG
81         WARN_RATELIMIT(1,
82                 "qgroup %llu %s reserved space underflow, have %llu to free %llu",
83                 qgroup->qgroupid, qgroup_rsv_type_str(type),
84                 qgroup->rsv.values[type], num_bytes);
85 #endif
86         qgroup->rsv.values[type] = 0;
87 }
88
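/*
 * Add all reserved space of @src to @dest, one rsv type at a time.  The
 * release variant below does the opposite.
 */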
89 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
90                                      struct btrfs_qgroup *dest,
91                                      struct btrfs_qgroup *src)
92 {
93         int i;
94
95         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
96                 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
97 }
98
99 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
100                                          struct btrfs_qgroup *dest,
101                                           struct btrfs_qgroup *src)
102 {
103         int i;
104
105         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
106                 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
107 }
108
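/*
 * Old/new refcnt helpers for qgroup accounting.
 *
 * @seq acts as a per-accounting-run baseline: if the stored refcnt is still
 * below @seq it is first reset to @seq, then @mod is applied.  The get
 * helpers below return the delta above @seq (0 if untouched in this run).
 */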
109 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
110                                            int mod)
111 {
112         if (qg->old_refcnt < seq)
113                 qg->old_refcnt = seq;
114         qg->old_refcnt += mod;
115 }
116
117 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
118                                            int mod)
119 {
120         if (qg->new_refcnt < seq)
121                 qg->new_refcnt = seq;
122         qg->new_refcnt += mod;
123 }
124
125 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
126 {
127         if (qg->old_refcnt < seq)
128                 return 0;
129         return qg->old_refcnt - seq;
130 }
131
132 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
133 {
134         if (qg->new_refcnt < seq)
135                 return 0;
136         return qg->new_refcnt - seq;
137 }
138
139 /*
140  * glue structure to represent the relations between qgroups.
141  */
142 struct btrfs_qgroup_list {
143         struct list_head next_group;
144         struct list_head next_member;
145         struct btrfs_qgroup *group;
146         struct btrfs_qgroup *member;
147 };
148
149 static int
150 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
151                    int init_flags);
152 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
153
154 /* must be called with qgroup_ioctl_lock held */
155 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
156                                            u64 qgroupid)
157 {
158         struct rb_node *n = fs_info->qgroup_tree.rb_node;
159         struct btrfs_qgroup *qgroup;
160
161         while (n) {
162                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
163                 if (qgroup->qgroupid < qgroupid)
164                         n = n->rb_left;
165                 else if (qgroup->qgroupid > qgroupid)
166                         n = n->rb_right;
167                 else
168                         return qgroup;
169         }
170         return NULL;
171 }
172
173 /*
174  * Add qgroup to the filesystem's qgroup tree.
175  *
176  * Must be called with qgroup_lock held and @prealloc preallocated.
177  *
178  * Ownership of @prealloc is transferred to this function, thus the caller
179  * should no longer touch @prealloc.
180  */
181 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
182                                           struct btrfs_qgroup *prealloc,
183                                           u64 qgroupid)
184 {
185         struct rb_node **p = &fs_info->qgroup_tree.rb_node;
186         struct rb_node *parent = NULL;
187         struct btrfs_qgroup *qgroup;
188
189         /* Caller must have pre-allocated @prealloc. */
190         ASSERT(prealloc);
191
192         while (*p) {
193                 parent = *p;
194                 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
195
196                 if (qgroup->qgroupid < qgroupid) {
197                         p = &(*p)->rb_left;
198                 } else if (qgroup->qgroupid > qgroupid) {
199                         p = &(*p)->rb_right;
200                 } else {
201                         kfree(prealloc);
202                         return qgroup;
203                 }
204         }
205
206         qgroup = prealloc;
207         qgroup->qgroupid = qgroupid;
208         INIT_LIST_HEAD(&qgroup->groups);
209         INIT_LIST_HEAD(&qgroup->members);
210         INIT_LIST_HEAD(&qgroup->dirty);
211         INIT_LIST_HEAD(&qgroup->iterator);
212         INIT_LIST_HEAD(&qgroup->nested_iterator);
213
214         rb_link_node(&qgroup->node, parent, p);
215         rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
216
217         return qgroup;
218 }
219
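/*
 * Unlink @qgroup from the dirty list and free all of its relation list
 * entries, both as a member and as a parent.  The rbtree node and the qgroup
 * structure itself are left for the caller to deal with.
 */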
220 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
221                             struct btrfs_qgroup *qgroup)
222 {
223         struct btrfs_qgroup_list *list;
224
225         list_del(&qgroup->dirty);
226         while (!list_empty(&qgroup->groups)) {
227                 list = list_first_entry(&qgroup->groups,
228                                         struct btrfs_qgroup_list, next_group);
229                 list_del(&list->next_group);
230                 list_del(&list->next_member);
231                 kfree(list);
232         }
233
234         while (!list_empty(&qgroup->members)) {
235                 list = list_first_entry(&qgroup->members,
236                                         struct btrfs_qgroup_list, next_member);
237                 list_del(&list->next_group);
238                 list_del(&list->next_member);
239                 kfree(list);
240         }
241 }
242
243 /* must be called with qgroup_lock held */
244 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
245 {
246         struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
247
248         if (!qgroup)
249                 return -ENOENT;
250
251         rb_erase(&qgroup->node, &fs_info->qgroup_tree);
252         __del_qgroup_rb(fs_info, qgroup);
253         return 0;
254 }
255
256 /*
257  * Add relation specified by two qgroups.
258  *
259  * Must be called with qgroup_lock held, the ownership of @prealloc is
260  * transferred to this function and caller should not touch it anymore.
261  *
262  * Return: 0        on success
263  *         -ENOENT  if one of the qgroups is NULL
264  *         <0       other errors
265  */
266 static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
267                              struct btrfs_qgroup *member,
268                              struct btrfs_qgroup *parent)
269 {
270         if (!member || !parent) {
271                 kfree(prealloc);
272                 return -ENOENT;
273         }
274
275         prealloc->group = parent;
276         prealloc->member = member;
277         list_add_tail(&prealloc->next_group, &member->groups);
278         list_add_tail(&prealloc->next_member, &parent->members);
279
280         return 0;
281 }
282
283 /*
284  * Add relation specified by two qgroup ids.
285  *
286  * Must be called with qgroup_lock held.
287  *
288  * Return: 0        on success
289  *         -ENOENT  if one of the ids does not exist
290  *         <0       other errors
291  */
292 static int add_relation_rb(struct btrfs_fs_info *fs_info,
293                            struct btrfs_qgroup_list *prealloc,
294                            u64 memberid, u64 parentid)
295 {
296         struct btrfs_qgroup *member;
297         struct btrfs_qgroup *parent;
298
299         member = find_qgroup_rb(fs_info, memberid);
300         parent = find_qgroup_rb(fs_info, parentid);
301
302         return __add_relation_rb(prealloc, member, parent);
303 }
304
305 /* Must be called with qgroup_lock held */
306 static int del_relation_rb(struct btrfs_fs_info *fs_info,
307                            u64 memberid, u64 parentid)
308 {
309         struct btrfs_qgroup *member;
310         struct btrfs_qgroup *parent;
311         struct btrfs_qgroup_list *list;
312
313         member = find_qgroup_rb(fs_info, memberid);
314         parent = find_qgroup_rb(fs_info, parentid);
315         if (!member || !parent)
316                 return -ENOENT;
317
318         list_for_each_entry(list, &member->groups, next_group) {
319                 if (list->group == parent) {
320                         list_del(&list->next_group);
321                         list_del(&list->next_member);
322                         kfree(list);
323                         return 0;
324                 }
325         }
326         return -ENOENT;
327 }
328
329 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
330 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
331                                u64 rfer, u64 excl)
332 {
333         struct btrfs_qgroup *qgroup;
334
335         qgroup = find_qgroup_rb(fs_info, qgroupid);
336         if (!qgroup)
337                 return -EINVAL;
338         if (qgroup->rfer != rfer || qgroup->excl != excl)
339                 return -EINVAL;
340         return 0;
341 }
342 #endif
343
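/*
 * Mark the qgroup status as inconsistent, and also set the runtime flags
 * that cancel a running rescan and skip further accounting updates.
 */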
344 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
345 {
346         fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
347                                   BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
348                                   BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
349 }
350
351 /*
352  * The full config is read in one go; this is only called from open_ctree().
353  * It doesn't use any locking, as at this point we're still single-threaded.
354  */
355 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
356 {
357         struct btrfs_key key;
358         struct btrfs_key found_key;
359         struct btrfs_root *quota_root = fs_info->quota_root;
360         struct btrfs_path *path = NULL;
361         struct extent_buffer *l;
362         int slot;
363         int ret = 0;
364         u64 flags = 0;
365         u64 rescan_progress = 0;
366
367         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
368                 return 0;
369
370         fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
371         if (!fs_info->qgroup_ulist) {
372                 ret = -ENOMEM;
373                 goto out;
374         }
375
376         path = btrfs_alloc_path();
377         if (!path) {
378                 ret = -ENOMEM;
379                 goto out;
380         }
381
382         ret = btrfs_sysfs_add_qgroups(fs_info);
383         if (ret < 0)
384                 goto out;
385         /* default this to quota off, in case no status key is found */
386         fs_info->qgroup_flags = 0;
387
388         /*
389          * pass 1: read status, all qgroup infos and limits
390          */
391         key.objectid = 0;
392         key.type = 0;
393         key.offset = 0;
394         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
395         if (ret)
396                 goto out;
397
398         while (1) {
399                 struct btrfs_qgroup *qgroup;
400
401                 slot = path->slots[0];
402                 l = path->nodes[0];
403                 btrfs_item_key_to_cpu(l, &found_key, slot);
404
405                 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
406                         struct btrfs_qgroup_status_item *ptr;
407
408                         ptr = btrfs_item_ptr(l, slot,
409                                              struct btrfs_qgroup_status_item);
410
411                         if (btrfs_qgroup_status_version(l, ptr) !=
412                             BTRFS_QGROUP_STATUS_VERSION) {
413                                 btrfs_err(fs_info,
414                                  "old qgroup version, quota disabled");
415                                 goto out;
416                         }
417                         if (btrfs_qgroup_status_generation(l, ptr) !=
418                             fs_info->generation) {
419                                 qgroup_mark_inconsistent(fs_info);
420                                 btrfs_err(fs_info,
421                                         "qgroup generation mismatch, marked as inconsistent");
422                         }
423                         fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
424                                                                           ptr);
425                         rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
426                         goto next1;
427                 }
428
429                 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
430                     found_key.type != BTRFS_QGROUP_LIMIT_KEY)
431                         goto next1;
432
433                 qgroup = find_qgroup_rb(fs_info, found_key.offset);
434                 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
435                     (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
436                         btrfs_err(fs_info, "inconsistent qgroup config");
437                         qgroup_mark_inconsistent(fs_info);
438                 }
439                 if (!qgroup) {
440                         struct btrfs_qgroup *prealloc;
441
442                         prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
443                         if (!prealloc) {
444                                 ret = -ENOMEM;
445                                 goto out;
446                         }
447                         qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
448                 }
449                 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
450                 if (ret < 0)
451                         goto out;
452
453                 switch (found_key.type) {
454                 case BTRFS_QGROUP_INFO_KEY: {
455                         struct btrfs_qgroup_info_item *ptr;
456
457                         ptr = btrfs_item_ptr(l, slot,
458                                              struct btrfs_qgroup_info_item);
459                         qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
460                         qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
461                         qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
462                         qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
463                         /* generation currently unused */
464                         break;
465                 }
466                 case BTRFS_QGROUP_LIMIT_KEY: {
467                         struct btrfs_qgroup_limit_item *ptr;
468
469                         ptr = btrfs_item_ptr(l, slot,
470                                              struct btrfs_qgroup_limit_item);
471                         qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
472                         qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
473                         qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
474                         qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
475                         qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
476                         break;
477                 }
478                 }
479 next1:
480                 ret = btrfs_next_item(quota_root, path);
481                 if (ret < 0)
482                         goto out;
483                 if (ret)
484                         break;
485         }
486         btrfs_release_path(path);
487
488         /*
489          * pass 2: read all qgroup relations
490          */
491         key.objectid = 0;
492         key.type = BTRFS_QGROUP_RELATION_KEY;
493         key.offset = 0;
494         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
495         if (ret)
496                 goto out;
497         while (1) {
498                 struct btrfs_qgroup_list *list = NULL;
499
500                 slot = path->slots[0];
501                 l = path->nodes[0];
502                 btrfs_item_key_to_cpu(l, &found_key, slot);
503
504                 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
505                         goto next2;
506
507                 if (found_key.objectid > found_key.offset) {
508                         /* parent <- member, not needed to build config */
509                         /* FIXME should we omit the key completely? */
510                         goto next2;
511                 }
512
513                 list = kzalloc(sizeof(*list), GFP_KERNEL);
514                 if (!list) {
515                         ret = -ENOMEM;
516                         goto out;
517                 }
518                 ret = add_relation_rb(fs_info, list, found_key.objectid,
519                                       found_key.offset);
520                 list = NULL;
521                 if (ret == -ENOENT) {
522                         btrfs_warn(fs_info,
523                                 "orphan qgroup relation 0x%llx->0x%llx",
524                                 found_key.objectid, found_key.offset);
525                         ret = 0;        /* ignore the error */
526                 }
527                 if (ret)
528                         goto out;
529 next2:
530                 ret = btrfs_next_item(quota_root, path);
531                 if (ret < 0)
532                         goto out;
533                 if (ret)
534                         break;
535         }
536 out:
537         btrfs_free_path(path);
538         fs_info->qgroup_flags |= flags;
539         if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
540                 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
541         else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
542                  ret >= 0)
543                 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
544
545         if (ret < 0) {
546                 ulist_free(fs_info->qgroup_ulist);
547                 fs_info->qgroup_ulist = NULL;
548                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
549                 btrfs_sysfs_del_qgroups(fs_info);
550         }
551
552         return ret < 0 ? ret : 0;
553 }
554
555 /*
556  * Called in close_ctree() when quota is still enabled.  This verifies we don't
557  * leak some reserved space.
558  *
559  * Return false if no reserved space is left.
560  * Return true if some reserved space is leaked.
561  */
562 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
563 {
564         struct rb_node *node;
565         bool ret = false;
566
567         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
568                 return ret;
569         /*
570          * Since we're unmounting, there is no race and no need to grab qgroup
571          * lock.  And here we don't go post-order to provide a more user
572          * friendly sorted result.
573          */
574         for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
575                 struct btrfs_qgroup *qgroup;
576                 int i;
577
578                 qgroup = rb_entry(node, struct btrfs_qgroup, node);
579                 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
580                         if (qgroup->rsv.values[i]) {
581                                 ret = true;
582                                 btrfs_warn(fs_info,
583                 "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
584                                    btrfs_qgroup_level(qgroup->qgroupid),
585                                    btrfs_qgroup_subvolid(qgroup->qgroupid),
586                                    i, qgroup->rsv.values[i]);
587                         }
588                 }
589         }
590         return ret;
591 }
592
593 /*
594  * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
595  * the first two are in single-threaded paths. And for the third one, we have
596  * already set quota_root to NULL with qgroup_lock held, so it is safe to
597  * clean up the in-memory structures without holding qgroup_lock.
598  */
599 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
600 {
601         struct rb_node *n;
602         struct btrfs_qgroup *qgroup;
603
604         while ((n = rb_first(&fs_info->qgroup_tree))) {
605                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
606                 rb_erase(n, &fs_info->qgroup_tree);
607                 __del_qgroup_rb(fs_info, qgroup);
608                 btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
609                 kfree(qgroup);
610         }
611         /*
612          * We call btrfs_free_qgroup_config() both when unmounting the
613          * filesystem and when disabling quota, so set qgroup_ulist to
614          * NULL here to avoid a double free.
615          */
616         ulist_free(fs_info->qgroup_ulist);
617         fs_info->qgroup_ulist = NULL;
618         btrfs_sysfs_del_qgroups(fs_info);
619 }
620
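/*
 * Insert one directed relation item (objectid @src, offset @dst) into the
 * quota tree.  Callers insert both directions to record a full relation.
 */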
621 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
622                                     u64 dst)
623 {
624         int ret;
625         struct btrfs_root *quota_root = trans->fs_info->quota_root;
626         struct btrfs_path *path;
627         struct btrfs_key key;
628
629         path = btrfs_alloc_path();
630         if (!path)
631                 return -ENOMEM;
632
633         key.objectid = src;
634         key.type = BTRFS_QGROUP_RELATION_KEY;
635         key.offset = dst;
636
637         ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
638
639         btrfs_mark_buffer_dirty(path->nodes[0]);
640
641         btrfs_free_path(path);
642         return ret;
643 }
644
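/* Delete the relation item (objectid @src, offset @dst) from the quota tree. */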
645 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
646                                     u64 dst)
647 {
648         int ret;
649         struct btrfs_root *quota_root = trans->fs_info->quota_root;
650         struct btrfs_path *path;
651         struct btrfs_key key;
652
653         path = btrfs_alloc_path();
654         if (!path)
655                 return -ENOMEM;
656
657         key.objectid = src;
658         key.type = BTRFS_QGROUP_RELATION_KEY;
659         key.offset = dst;
660
661         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
662         if (ret < 0)
663                 goto out;
664
665         if (ret > 0) {
666                 ret = -ENOENT;
667                 goto out;
668         }
669
670         ret = btrfs_del_item(trans, quota_root, path);
671 out:
672         btrfs_free_path(path);
673         return ret;
674 }
675
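/*
 * Create the on-disk items for qgroup @qgroupid: a QGROUP_INFO item with
 * zeroed counters and a QGROUP_LIMIT item with zeroed limits.
 */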
676 static int add_qgroup_item(struct btrfs_trans_handle *trans,
677                            struct btrfs_root *quota_root, u64 qgroupid)
678 {
679         int ret;
680         struct btrfs_path *path;
681         struct btrfs_qgroup_info_item *qgroup_info;
682         struct btrfs_qgroup_limit_item *qgroup_limit;
683         struct extent_buffer *leaf;
684         struct btrfs_key key;
685
686         if (btrfs_is_testing(quota_root->fs_info))
687                 return 0;
688
689         path = btrfs_alloc_path();
690         if (!path)
691                 return -ENOMEM;
692
693         key.objectid = 0;
694         key.type = BTRFS_QGROUP_INFO_KEY;
695         key.offset = qgroupid;
696
697         /*
698          * Avoid a transaction abort by catching -EEXIST here. In that
699          * case, we proceed by re-initializing the existing structure
700          * on disk.
701          */
702
703         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
704                                       sizeof(*qgroup_info));
705         if (ret && ret != -EEXIST)
706                 goto out;
707
708         leaf = path->nodes[0];
709         qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
710                                  struct btrfs_qgroup_info_item);
711         btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
712         btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
713         btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
714         btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
715         btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
716
717         btrfs_mark_buffer_dirty(leaf);
718
719         btrfs_release_path(path);
720
721         key.type = BTRFS_QGROUP_LIMIT_KEY;
722         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
723                                       sizeof(*qgroup_limit));
724         if (ret && ret != -EEXIST)
725                 goto out;
726
727         leaf = path->nodes[0];
728         qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
729                                   struct btrfs_qgroup_limit_item);
730         btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
731         btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
732         btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
733         btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
734         btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
735
736         btrfs_mark_buffer_dirty(leaf);
737
738         ret = 0;
739 out:
740         btrfs_free_path(path);
741         return ret;
742 }
743
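/* Remove the QGROUP_INFO and QGROUP_LIMIT items of @qgroupid from the quota tree. */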
744 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
745 {
746         int ret;
747         struct btrfs_root *quota_root = trans->fs_info->quota_root;
748         struct btrfs_path *path;
749         struct btrfs_key key;
750
751         path = btrfs_alloc_path();
752         if (!path)
753                 return -ENOMEM;
754
755         key.objectid = 0;
756         key.type = BTRFS_QGROUP_INFO_KEY;
757         key.offset = qgroupid;
758         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
759         if (ret < 0)
760                 goto out;
761
762         if (ret > 0) {
763                 ret = -ENOENT;
764                 goto out;
765         }
766
767         ret = btrfs_del_item(trans, quota_root, path);
768         if (ret)
769                 goto out;
770
771         btrfs_release_path(path);
772
773         key.type = BTRFS_QGROUP_LIMIT_KEY;
774         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
775         if (ret < 0)
776                 goto out;
777
778         if (ret > 0) {
779                 ret = -ENOENT;
780                 goto out;
781         }
782
783         ret = btrfs_del_item(trans, quota_root, path);
784
785 out:
786         btrfs_free_path(path);
787         return ret;
788 }
789
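/*
 * The following helpers write the in-memory qgroup state back to the quota
 * tree: the per-qgroup limit and info items, and the global status item.
 */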
790 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
791                                     struct btrfs_qgroup *qgroup)
792 {
793         struct btrfs_root *quota_root = trans->fs_info->quota_root;
794         struct btrfs_path *path;
795         struct btrfs_key key;
796         struct extent_buffer *l;
797         struct btrfs_qgroup_limit_item *qgroup_limit;
798         int ret;
799         int slot;
800
801         key.objectid = 0;
802         key.type = BTRFS_QGROUP_LIMIT_KEY;
803         key.offset = qgroup->qgroupid;
804
805         path = btrfs_alloc_path();
806         if (!path)
807                 return -ENOMEM;
808
809         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
810         if (ret > 0)
811                 ret = -ENOENT;
812
813         if (ret)
814                 goto out;
815
816         l = path->nodes[0];
817         slot = path->slots[0];
818         qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
819         btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
820         btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
821         btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
822         btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
823         btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
824
825         btrfs_mark_buffer_dirty(l);
826
827 out:
828         btrfs_free_path(path);
829         return ret;
830 }
831
832 static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
833                                    struct btrfs_qgroup *qgroup)
834 {
835         struct btrfs_fs_info *fs_info = trans->fs_info;
836         struct btrfs_root *quota_root = fs_info->quota_root;
837         struct btrfs_path *path;
838         struct btrfs_key key;
839         struct extent_buffer *l;
840         struct btrfs_qgroup_info_item *qgroup_info;
841         int ret;
842         int slot;
843
844         if (btrfs_is_testing(fs_info))
845                 return 0;
846
847         key.objectid = 0;
848         key.type = BTRFS_QGROUP_INFO_KEY;
849         key.offset = qgroup->qgroupid;
850
851         path = btrfs_alloc_path();
852         if (!path)
853                 return -ENOMEM;
854
855         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
856         if (ret > 0)
857                 ret = -ENOENT;
858
859         if (ret)
860                 goto out;
861
862         l = path->nodes[0];
863         slot = path->slots[0];
864         qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
865         btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
866         btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
867         btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
868         btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
869         btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
870
871         btrfs_mark_buffer_dirty(l);
872
873 out:
874         btrfs_free_path(path);
875         return ret;
876 }
877
878 static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
879 {
880         struct btrfs_fs_info *fs_info = trans->fs_info;
881         struct btrfs_root *quota_root = fs_info->quota_root;
882         struct btrfs_path *path;
883         struct btrfs_key key;
884         struct extent_buffer *l;
885         struct btrfs_qgroup_status_item *ptr;
886         int ret;
887         int slot;
888
889         key.objectid = 0;
890         key.type = BTRFS_QGROUP_STATUS_KEY;
891         key.offset = 0;
892
893         path = btrfs_alloc_path();
894         if (!path)
895                 return -ENOMEM;
896
897         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
898         if (ret > 0)
899                 ret = -ENOENT;
900
901         if (ret)
902                 goto out;
903
904         l = path->nodes[0];
905         slot = path->slots[0];
906         ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
907         btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
908                                       BTRFS_QGROUP_STATUS_FLAGS_MASK);
909         btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
910         btrfs_set_qgroup_status_rescan(l, ptr,
911                                 fs_info->qgroup_rescan_progress.objectid);
912
913         btrfs_mark_buffer_dirty(l);
914
915 out:
916         btrfs_free_path(path);
917         return ret;
918 }
919
920 /*
921  * called with qgroup_lock held
922  */
923 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
924                                   struct btrfs_root *root)
925 {
926         struct btrfs_path *path;
927         struct btrfs_key key;
928         struct extent_buffer *leaf = NULL;
929         int ret;
930         int nr = 0;
931
932         path = btrfs_alloc_path();
933         if (!path)
934                 return -ENOMEM;
935
936         key.objectid = 0;
937         key.offset = 0;
938         key.type = 0;
939
940         while (1) {
941                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
942                 if (ret < 0)
943                         goto out;
944                 leaf = path->nodes[0];
945                 nr = btrfs_header_nritems(leaf);
946                 if (!nr)
947                         break;
948                 /*
949                  * Delete the leaves one by one
950                  * since the whole tree is going
951                  * to be deleted.
952                  */
953                 path->slots[0] = 0;
954                 ret = btrfs_del_items(trans, root, path, 0, nr);
955                 if (ret)
956                         goto out;
957
958                 btrfs_release_path(path);
959         }
960         ret = 0;
961 out:
962         btrfs_free_path(path);
963         return ret;
964 }
965
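/*
 * Enable qgroups: create the quota root and its status item, add a qgroup
 * for every existing subvolume (including the top-level fs tree), commit the
 * transaction, and finally start the initial rescan.
 */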
966 int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
967 {
968         struct btrfs_root *quota_root;
969         struct btrfs_root *tree_root = fs_info->tree_root;
970         struct btrfs_path *path = NULL;
971         struct btrfs_qgroup_status_item *ptr;
972         struct extent_buffer *leaf;
973         struct btrfs_key key;
974         struct btrfs_key found_key;
975         struct btrfs_qgroup *qgroup = NULL;
976         struct btrfs_qgroup *prealloc = NULL;
977         struct btrfs_trans_handle *trans = NULL;
978         struct ulist *ulist = NULL;
979         int ret = 0;
980         int slot;
981
982         /*
983          * We need to have subvol_sem write locked, to prevent races between
984          * concurrent tasks trying to enable quotas, because we will unlock
985          * and relock qgroup_ioctl_lock before setting fs_info->quota_root
986          * and before setting BTRFS_FS_QUOTA_ENABLED.
987          */
988         lockdep_assert_held_write(&fs_info->subvol_sem);
989
990         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
991                 btrfs_err(fs_info,
992                           "qgroups are currently unsupported in extent tree v2");
993                 return -EINVAL;
994         }
995
996         mutex_lock(&fs_info->qgroup_ioctl_lock);
997         if (fs_info->quota_root)
998                 goto out;
999
1000         ulist = ulist_alloc(GFP_KERNEL);
1001         if (!ulist) {
1002                 ret = -ENOMEM;
1003                 goto out;
1004         }
1005
1006         ret = btrfs_sysfs_add_qgroups(fs_info);
1007         if (ret < 0)
1008                 goto out;
1009
1010         /*
1011          * Unlock qgroup_ioctl_lock before starting the transaction. This is to
1012          * avoid lock acquisition inversion problems (reported by lockdep) between
1013          * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
1014          * start a transaction.
1015          * After starting the transaction, lock qgroup_ioctl_lock again and
1016          * check if someone else created the quota root in the meanwhile. If so,
1017          * just return success and release the transaction handle.
1018          *
1019          * Also we don't need to worry about someone else calling
1020          * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
1021          * that function returns 0 (success) when the sysfs entries already exist.
1022          */
1023         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1024
1025         /*
1026          * 1 for quota root item
1027          * 1 for BTRFS_QGROUP_STATUS item
1028          *
1029          * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
1030          * of each subvolume. However those are not currently reserved since
1031          * it would be a lot of overkill.
1032          */
1033         trans = btrfs_start_transaction(tree_root, 2);
1034
1035         mutex_lock(&fs_info->qgroup_ioctl_lock);
1036         if (IS_ERR(trans)) {
1037                 ret = PTR_ERR(trans);
1038                 trans = NULL;
1039                 goto out;
1040         }
1041
1042         if (fs_info->quota_root)
1043                 goto out;
1044
1045         fs_info->qgroup_ulist = ulist;
1046         ulist = NULL;
1047
1048         /*
1049          * initially create the quota tree
1050          */
1051         quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
1052         if (IS_ERR(quota_root)) {
1053                 ret =  PTR_ERR(quota_root);
1054                 btrfs_abort_transaction(trans, ret);
1055                 goto out;
1056         }
1057
1058         path = btrfs_alloc_path();
1059         if (!path) {
1060                 ret = -ENOMEM;
1061                 btrfs_abort_transaction(trans, ret);
1062                 goto out_free_root;
1063         }
1064
1065         key.objectid = 0;
1066         key.type = BTRFS_QGROUP_STATUS_KEY;
1067         key.offset = 0;
1068
1069         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
1070                                       sizeof(*ptr));
1071         if (ret) {
1072                 btrfs_abort_transaction(trans, ret);
1073                 goto out_free_path;
1074         }
1075
1076         leaf = path->nodes[0];
1077         ptr = btrfs_item_ptr(leaf, path->slots[0],
1078                                  struct btrfs_qgroup_status_item);
1079         btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
1080         btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
1081         fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
1082                                 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1083         btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
1084                                       BTRFS_QGROUP_STATUS_FLAGS_MASK);
1085         btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
1086
1087         btrfs_mark_buffer_dirty(leaf);
1088
1089         key.objectid = 0;
1090         key.type = BTRFS_ROOT_REF_KEY;
1091         key.offset = 0;
1092
1093         btrfs_release_path(path);
1094         ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
1095         if (ret > 0)
1096                 goto out_add_root;
1097         if (ret < 0) {
1098                 btrfs_abort_transaction(trans, ret);
1099                 goto out_free_path;
1100         }
1101
1102         while (1) {
1103                 slot = path->slots[0];
1104                 leaf = path->nodes[0];
1105                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1106
1107                 if (found_key.type == BTRFS_ROOT_REF_KEY) {
1108
1109                         /* Release locks on tree_root before we access quota_root */
1110                         btrfs_release_path(path);
1111
1112                         /* We should not have a stray @prealloc pointer. */
1113                         ASSERT(prealloc == NULL);
1114                         prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
1115                         if (!prealloc) {
1116                                 ret = -ENOMEM;
1117                                 btrfs_abort_transaction(trans, ret);
1118                                 goto out_free_path;
1119                         }
1120
1121                         ret = add_qgroup_item(trans, quota_root,
1122                                               found_key.offset);
1123                         if (ret) {
1124                                 btrfs_abort_transaction(trans, ret);
1125                                 goto out_free_path;
1126                         }
1127
1128                         qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
1129                         prealloc = NULL;
1130                         if (IS_ERR(qgroup)) {
1131                                 ret = PTR_ERR(qgroup);
1132                                 btrfs_abort_transaction(trans, ret);
1133                                 goto out_free_path;
1134                         }
1135                         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1136                         if (ret < 0) {
1137                                 btrfs_abort_transaction(trans, ret);
1138                                 goto out_free_path;
1139                         }
1140                         ret = btrfs_search_slot_for_read(tree_root, &found_key,
1141                                                          path, 1, 0);
1142                         if (ret < 0) {
1143                                 btrfs_abort_transaction(trans, ret);
1144                                 goto out_free_path;
1145                         }
1146                         if (ret > 0) {
1147                                 /*
1148                                  * Shouldn't happen, but in case it does we
1149                                  * don't need to do the btrfs_next_item, just
1150                                  * continue.
1151                                  */
1152                                 continue;
1153                         }
1154                 }
1155                 ret = btrfs_next_item(tree_root, path);
1156                 if (ret < 0) {
1157                         btrfs_abort_transaction(trans, ret);
1158                         goto out_free_path;
1159                 }
1160                 if (ret)
1161                         break;
1162         }
1163
1164 out_add_root:
1165         btrfs_release_path(path);
1166         ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
1167         if (ret) {
1168                 btrfs_abort_transaction(trans, ret);
1169                 goto out_free_path;
1170         }
1171
1172         ASSERT(prealloc == NULL);
1173         prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
1174         if (!prealloc) {
1175                 ret = -ENOMEM;
1176                 goto out_free_path;
1177         }
1178         qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
1179         prealloc = NULL;
1180         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1181         if (ret < 0) {
1182                 btrfs_abort_transaction(trans, ret);
1183                 goto out_free_path;
1184         }
1185
1186         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1187         /*
1188          * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
1189          * a deadlock with tasks concurrently doing other qgroup operations, such
1190          * as adding/removing qgroups or adding/deleting qgroup relations for example,
1191          * because all qgroup operations first start or join a transaction and then
1192          * lock the qgroup_ioctl_lock mutex.
1193          * We are safe from a concurrent task trying to enable quotas, by calling
1194          * this function, since we are serialized by fs_info->subvol_sem.
1195          */
1196         ret = btrfs_commit_transaction(trans);
1197         trans = NULL;
1198         mutex_lock(&fs_info->qgroup_ioctl_lock);
1199         if (ret)
1200                 goto out_free_path;
1201
1202         /*
1203          * Set quota enabled flag after committing the transaction, to avoid
1204          * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
1205          * creation.
1206          */
1207         spin_lock(&fs_info->qgroup_lock);
1208         fs_info->quota_root = quota_root;
1209         set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1210         spin_unlock(&fs_info->qgroup_lock);
1211
1212         ret = qgroup_rescan_init(fs_info, 0, 1);
1213         if (!ret) {
1214                 qgroup_rescan_zero_tracking(fs_info);
1215                 fs_info->qgroup_rescan_running = true;
1216                 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1217                                  &fs_info->qgroup_rescan_work);
1218         } else {
1219                 /*
1220                  * We have set both BTRFS_FS_QUOTA_ENABLED and
1221                  * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
1222                  * -EINPROGRESS. That can happen because someone started the
1223                  * rescan worker by calling quota rescan ioctl before we
1224                  * attempted to initialize the rescan worker. Failure due to
1225                  * quotas disabled in the meanwhile is not possible, because
1226                  * we are holding a write lock on fs_info->subvol_sem, which
1227                  * is also acquired when disabling quotas.
1228                  * Ignore such error, and any other error would need to undo
1229                  * everything we did in the transaction we just committed.
1230                  */
1231                 ASSERT(ret == -EINPROGRESS);
1232                 ret = 0;
1233         }
1234
1235 out_free_path:
1236         btrfs_free_path(path);
1237 out_free_root:
1238         if (ret)
1239                 btrfs_put_root(quota_root);
1240 out:
1241         if (ret) {
1242                 ulist_free(fs_info->qgroup_ulist);
1243                 fs_info->qgroup_ulist = NULL;
1244                 btrfs_sysfs_del_qgroups(fs_info);
1245         }
1246         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1247         if (ret && trans)
1248                 btrfs_end_transaction(trans);
1249         else if (trans)
1250                 ret = btrfs_end_transaction(trans);
1251         ulist_free(ulist);
1252         kfree(prealloc);
1253         return ret;
1254 }
1255
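/*
 * Disable qgroups: wait for a running rescan to finish, free the in-memory
 * config, empty and delete the quota tree, and finally drop the quota root.
 */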
1256 int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
1257 {
1258         struct btrfs_root *quota_root;
1259         struct btrfs_trans_handle *trans = NULL;
1260         int ret = 0;
1261
1262         /*
1263          * We need to have subvol_sem write locked to prevent races with
1264          * snapshot creation.
1265          */
1266         lockdep_assert_held_write(&fs_info->subvol_sem);
1267
1268         /*
1269          * Lock the cleaner mutex to prevent races with concurrent relocation,
1270          * because relocation may be building backrefs for blocks of the quota
1271          * root while we are deleting the root. This is like dropping fs roots
1272          * of deleted snapshots/subvolumes, we need the same protection.
1273          *
1274          * This also prevents races between concurrent tasks trying to disable
1275          * quotas, because we will unlock and relock qgroup_ioctl_lock across
1276          * BTRFS_FS_QUOTA_ENABLED changes.
1277          */
1278         mutex_lock(&fs_info->cleaner_mutex);
1279
1280         mutex_lock(&fs_info->qgroup_ioctl_lock);
1281         if (!fs_info->quota_root)
1282                 goto out;
1283
1284         /*
1285          * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
1286          * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
1287          * to lock that mutex while holding a transaction handle and the rescan
1288          * worker needs to commit a transaction.
1289          */
1290         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1291
1292         /*
1293          * Request qgroup rescan worker to complete and wait for it. This wait
1294          * must be done before transaction start for quota disable since it may
1295          * deadlock with transaction by the qgroup rescan worker.
1296          */
1297         clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1298         btrfs_qgroup_wait_for_completion(fs_info, false);
1299
1300         /*
1301          * 1 For the root item
1302          *
1303          * We should also reserve enough items for the quota tree deletion in
1304          * btrfs_clean_quota_tree but this is not done.
1305          *
1306          * Also, we must always start a transaction without holding the mutex
1307          * qgroup_ioctl_lock, see btrfs_quota_enable().
1308          */
1309         trans = btrfs_start_transaction(fs_info->tree_root, 1);
1310
1311         mutex_lock(&fs_info->qgroup_ioctl_lock);
1312         if (IS_ERR(trans)) {
1313                 ret = PTR_ERR(trans);
1314                 trans = NULL;
1315                 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1316                 goto out;
1317         }
1318
1319         if (!fs_info->quota_root)
1320                 goto out;
1321
1322         spin_lock(&fs_info->qgroup_lock);
1323         quota_root = fs_info->quota_root;
1324         fs_info->quota_root = NULL;
1325         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1326         fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
1327         spin_unlock(&fs_info->qgroup_lock);
1328
1329         btrfs_free_qgroup_config(fs_info);
1330
1331         ret = btrfs_clean_quota_tree(trans, quota_root);
1332         if (ret) {
1333                 btrfs_abort_transaction(trans, ret);
1334                 goto out;
1335         }
1336
1337         ret = btrfs_del_root(trans, &quota_root->root_key);
1338         if (ret) {
1339                 btrfs_abort_transaction(trans, ret);
1340                 goto out;
1341         }
1342
1343         spin_lock(&fs_info->trans_lock);
1344         list_del(&quota_root->dirty_list);
1345         spin_unlock(&fs_info->trans_lock);
1346
1347         btrfs_tree_lock(quota_root->node);
1348         btrfs_clear_buffer_dirty(trans, quota_root->node);
1349         btrfs_tree_unlock(quota_root->node);
1350         btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
1351                               quota_root->node, 0, 1);
1352
1353         btrfs_put_root(quota_root);
1354
1355 out:
1356         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1357         if (ret && trans)
1358                 btrfs_end_transaction(trans);
1359         else if (trans)
1360                 ret = btrfs_end_transaction(trans);
1361         mutex_unlock(&fs_info->cleaner_mutex);
1362
1363         return ret;
1364 }
1365
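/*
 * Queue @qgroup on the per-fs dirty list so that its updated numbers get
 * written back to the quota tree later; adding it twice is a no-op.
 */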
1366 static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1367                          struct btrfs_qgroup *qgroup)
1368 {
1369         if (list_empty(&qgroup->dirty))
1370                 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1371 }
1372
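/*
 * Helpers to iterate qgroup hierarchies without memory allocation, using the
 * iterator list_head embedded in each qgroup.  An entry that is already on a
 * list is silently skipped, and qgroup_iterator_clean() must be called once
 * the traversal is done.
 */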
1373 static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
1374 {
1375         if (!list_empty(&qgroup->iterator))
1376                 return;
1377
1378         list_add_tail(&qgroup->iterator, head);
1379 }
1380
1381 static void qgroup_iterator_clean(struct list_head *head)
1382 {
1383         while (!list_empty(head)) {
1384                 struct btrfs_qgroup *qgroup;
1385
1386                 qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
1387                 list_del_init(&qgroup->iterator);
1388         }
1389 }
1390
1391 /*
1392  * The easy accounting: we're updating a qgroup relationship whose child qgroup
1393  * only has exclusive extents.
1394  *
1395  * In this case, all exclusive extents will also be exclusive for the parent, so
1396  * excl/rfer just get added/removed.
1397  *
1398  * The same holds for the qgroup reservation space, which should also be added
1399  * to or removed from the parent.
1400  * Otherwise, when the child later releases reservation space, the parent would
1401  * underflow its reservation (for the relationship adding case).
1402  *
1403  * Caller should hold fs_info->qgroup_lock.
1404  */
1405 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
1406                                     struct btrfs_qgroup *src, int sign)
1407 {
1408         struct btrfs_qgroup *qgroup;
1409         struct btrfs_qgroup *cur;
1410         LIST_HEAD(qgroup_list);
1411         u64 num_bytes = src->excl;
1412         int ret = 0;
1413
1414         qgroup = find_qgroup_rb(fs_info, ref_root);
1415         if (!qgroup)
1416                 goto out;
1417
1418         qgroup_iterator_add(&qgroup_list, qgroup);
1419         list_for_each_entry(cur, &qgroup_list, iterator) {
1420                 struct btrfs_qgroup_list *glist;
1421
1422                 cur->rfer += sign * num_bytes;
1423                 cur->rfer_cmpr += sign * num_bytes;
1424
1425                 WARN_ON(sign < 0 && cur->excl < num_bytes);
1426                 cur->excl += sign * num_bytes;
1427                 cur->excl_cmpr += sign * num_bytes;
1428
1429                 if (sign > 0)
1430                         qgroup_rsv_add_by_qgroup(fs_info, cur, src);
1431                 else
1432                         qgroup_rsv_release_by_qgroup(fs_info, cur, src);
1433                 qgroup_dirty(fs_info, cur);
1434
1435                 /* Append parent qgroups to @qgroup_list. */
1436                 list_for_each_entry(glist, &cur->groups, next_group)
1437                         qgroup_iterator_add(&qgroup_list, glist->group);
1438         }
1439         ret = 0;
1440 out:
1441         qgroup_iterator_clean(&qgroup_list);
1442         return ret;
1443 }
1444
1445
1446 /*
1447  * Quick path for updating qgroup with only excl refs.
1448  *
1449  * In that case, just updating all parents is enough.
1450  * Otherwise we need to do a full rescan.
1451  * Caller should also hold fs_info->qgroup_lock.
1452  *
1453  * Return 0 for quick update, >0 when a full rescan is needed and the
1454  * INCONSISTENT flag gets set.
1455  * Return <0 for other errors.
1456  */
1457 static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1458                                    u64 src, u64 dst, int sign)
1459 {
1460         struct btrfs_qgroup *qgroup;
1461         int ret = 1;
1462         int err = 0;
1463
1464         qgroup = find_qgroup_rb(fs_info, src);
1465         if (!qgroup)
1466                 goto out;
1467         if (qgroup->excl == qgroup->rfer) {
1468                 ret = 0;
1469                 err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
1470                 if (err < 0) {
1471                         ret = err;
1472                         goto out;
1473                 }
1474         }
1475 out:
1476         if (ret)
1477                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1478         return ret;
1479 }
1480
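/*
 * Add a member/parent relation between qgroups @src (member) and @dst
 * (parent): both relation items are inserted into the quota tree, the
 * in-memory lists are linked and the quick accounting update is attempted.
 */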
1481 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1482                               u64 dst)
1483 {
1484         struct btrfs_fs_info *fs_info = trans->fs_info;
1485         struct btrfs_qgroup *parent;
1486         struct btrfs_qgroup *member;
1487         struct btrfs_qgroup_list *list;
1488         struct btrfs_qgroup_list *prealloc = NULL;
1489         int ret = 0;
1490
1491         /* Check the level of src and dst first */
1492         if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1493                 return -EINVAL;
1494
1495         mutex_lock(&fs_info->qgroup_ioctl_lock);
1496         if (!fs_info->quota_root) {
1497                 ret = -ENOTCONN;
1498                 goto out;
1499         }
1500         member = find_qgroup_rb(fs_info, src);
1501         parent = find_qgroup_rb(fs_info, dst);
1502         if (!member || !parent) {
1503                 ret = -EINVAL;
1504                 goto out;
1505         }
1506
1507         /* Check if such a qgroup relation already exists. */
1508         list_for_each_entry(list, &member->groups, next_group) {
1509                 if (list->group == parent) {
1510                         ret = -EEXIST;
1511                         goto out;
1512                 }
1513         }
1514
1515         prealloc = kzalloc(sizeof(*list), GFP_NOFS);
1516         if (!prealloc) {
1517                 ret = -ENOMEM;
1518                 goto out;
1519         }
1520         ret = add_qgroup_relation_item(trans, src, dst);
1521         if (ret)
1522                 goto out;
1523
1524         ret = add_qgroup_relation_item(trans, dst, src);
1525         if (ret) {
1526                 del_qgroup_relation_item(trans, src, dst);
1527                 goto out;
1528         }
1529
1530         spin_lock(&fs_info->qgroup_lock);
1531         ret = __add_relation_rb(prealloc, member, parent);
1532         prealloc = NULL;
1533         if (ret < 0) {
1534                 spin_unlock(&fs_info->qgroup_lock);
1535                 goto out;
1536         }
1537         ret = quick_update_accounting(fs_info, src, dst, 1);
1538         spin_unlock(&fs_info->qgroup_lock);
1539 out:
1540         kfree(prealloc);
1541         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1542         return ret;
1543 }
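/*
 * Note on the preallocation pattern above: the btrfs_qgroup_list item is
 * allocated with GFP_NOFS before fs_info->qgroup_lock is taken, since no
 * allocation may happen under that spinlock.  Ownership of the item is
 * handed to __add_relation_rb() unconditionally (@prealloc is set to NULL
 * right after the call), so the final kfree(prealloc) only frees it on the
 * earlier error paths where it was never consumed.
 */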
1544
1545 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1546                                  u64 dst)
1547 {
1548         struct btrfs_fs_info *fs_info = trans->fs_info;
1549         struct btrfs_qgroup *parent;
1550         struct btrfs_qgroup *member;
1551         struct btrfs_qgroup_list *list;
1552         bool found = false;
1553         int ret = 0;
1554         int ret2;
1555
1556         if (!fs_info->quota_root) {
1557                 ret = -ENOTCONN;
1558                 goto out;
1559         }
1560
1561         member = find_qgroup_rb(fs_info, src);
1562         parent = find_qgroup_rb(fs_info, dst);
1563         /*
1564          * If the parent/member pair doesn't exist, then only try to delete the
1565          * dead relation items.
1566          */
1567         if (!member || !parent)
1568                 goto delete_item;
1569
1570         /* Check if such a qgroup relation exists. */
1571         list_for_each_entry(list, &member->groups, next_group) {
1572                 if (list->group == parent) {
1573                         found = true;
1574                         break;
1575                 }
1576         }
1577
1578 delete_item:
1579         ret = del_qgroup_relation_item(trans, src, dst);
1580         if (ret < 0 && ret != -ENOENT)
1581                 goto out;
1582         ret2 = del_qgroup_relation_item(trans, dst, src);
1583         if (ret2 < 0 && ret2 != -ENOENT)
1584                 goto out;
1585
1586         /* At least one deletion succeeded, return 0 */
1587         if (!ret || !ret2)
1588                 ret = 0;
1589
1590         if (found) {
1591                 spin_lock(&fs_info->qgroup_lock);
1592                 del_relation_rb(fs_info, src, dst);
1593                 ret = quick_update_accounting(fs_info, src, dst, -1);
1594                 spin_unlock(&fs_info->qgroup_lock);
1595         }
1596 out:
1597         return ret;
1598 }
1599
1600 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1601                               u64 dst)
1602 {
1603         struct btrfs_fs_info *fs_info = trans->fs_info;
1604         int ret = 0;
1605
1606         mutex_lock(&fs_info->qgroup_ioctl_lock);
1607         ret = __del_qgroup_relation(trans, src, dst);
1608         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1609
1610         return ret;
1611 }
1612
1613 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1614 {
1615         struct btrfs_fs_info *fs_info = trans->fs_info;
1616         struct btrfs_root *quota_root;
1617         struct btrfs_qgroup *qgroup;
1618         struct btrfs_qgroup *prealloc = NULL;
1619         int ret = 0;
1620
1621         mutex_lock(&fs_info->qgroup_ioctl_lock);
1622         if (!fs_info->quota_root) {
1623                 ret = -ENOTCONN;
1624                 goto out;
1625         }
1626         quota_root = fs_info->quota_root;
1627         qgroup = find_qgroup_rb(fs_info, qgroupid);
1628         if (qgroup) {
1629                 ret = -EEXIST;
1630                 goto out;
1631         }
1632
1633         prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
1634         if (!prealloc) {
1635                 ret = -ENOMEM;
1636                 goto out;
1637         }
1638
1639         ret = add_qgroup_item(trans, quota_root, qgroupid);
1640         if (ret)
1641                 goto out;
1642
1643         spin_lock(&fs_info->qgroup_lock);
1644         qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
1645         spin_unlock(&fs_info->qgroup_lock);
1646         prealloc = NULL;
1647
1648         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1649 out:
1650         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1651         kfree(prealloc);
1652         return ret;
1653 }
1654
1655 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1656 {
1657         struct btrfs_fs_info *fs_info = trans->fs_info;
1658         struct btrfs_qgroup *qgroup;
1659         struct btrfs_qgroup_list *list;
1660         int ret = 0;
1661
1662         mutex_lock(&fs_info->qgroup_ioctl_lock);
1663         if (!fs_info->quota_root) {
1664                 ret = -ENOTCONN;
1665                 goto out;
1666         }
1667
1668         qgroup = find_qgroup_rb(fs_info, qgroupid);
1669         if (!qgroup) {
1670                 ret = -ENOENT;
1671                 goto out;
1672         }
1673
1674         /* Check if there are no children of this qgroup */
1675         if (!list_empty(&qgroup->members)) {
1676                 ret = -EBUSY;
1677                 goto out;
1678         }
1679
1680         ret = del_qgroup_item(trans, qgroupid);
1681         if (ret && ret != -ENOENT)
1682                 goto out;
1683
1684         while (!list_empty(&qgroup->groups)) {
1685                 list = list_first_entry(&qgroup->groups,
1686                                         struct btrfs_qgroup_list, next_group);
1687                 ret = __del_qgroup_relation(trans, qgroupid,
1688                                             list->group->qgroupid);
1689                 if (ret)
1690                         goto out;
1691         }
1692
1693         spin_lock(&fs_info->qgroup_lock);
1694         del_qgroup_rb(fs_info, qgroupid);
1695         spin_unlock(&fs_info->qgroup_lock);
1696
1697         /*
1698          * Remove the qgroup from sysfs now without holding the qgroup_lock
1699          * spinlock, since the sysfs_remove_group() function needs to take
1700          * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1701          */
1702         btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1703         kfree(qgroup);
1704 out:
1705         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1706         return ret;
1707 }
1708
1709 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1710                        struct btrfs_qgroup_limit *limit)
1711 {
1712         struct btrfs_fs_info *fs_info = trans->fs_info;
1713         struct btrfs_qgroup *qgroup;
1714         int ret = 0;
1715         /*
1716          * Sometimes we want to clear the limit on this qgroup. We treat -1
1717          * as a special value which tells the kernel to clear the limit.
1718          */
1719         const u64 CLEAR_VALUE = -1;
1720
1721         mutex_lock(&fs_info->qgroup_ioctl_lock);
1722         if (!fs_info->quota_root) {
1723                 ret = -ENOTCONN;
1724                 goto out;
1725         }
1726
1727         qgroup = find_qgroup_rb(fs_info, qgroupid);
1728         if (!qgroup) {
1729                 ret = -ENOENT;
1730                 goto out;
1731         }
1732
1733         spin_lock(&fs_info->qgroup_lock);
1734         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1735                 if (limit->max_rfer == CLEAR_VALUE) {
1736                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1737                         limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1738                         qgroup->max_rfer = 0;
1739                 } else {
1740                         qgroup->max_rfer = limit->max_rfer;
1741                 }
1742         }
1743         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1744                 if (limit->max_excl == CLEAR_VALUE) {
1745                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1746                         limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1747                         qgroup->max_excl = 0;
1748                 } else {
1749                         qgroup->max_excl = limit->max_excl;
1750                 }
1751         }
1752         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1753                 if (limit->rsv_rfer == CLEAR_VALUE) {
1754                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1755                         limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1756                         qgroup->rsv_rfer = 0;
1757                 } else {
1758                         qgroup->rsv_rfer = limit->rsv_rfer;
1759                 }
1760         }
1761         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1762                 if (limit->rsv_excl == CLEAR_VALUE) {
1763                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1764                         limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1765                         qgroup->rsv_excl = 0;
1766                 } else {
1767                         qgroup->rsv_excl = limit->rsv_excl;
1768                 }
1769         }
1770         qgroup->lim_flags |= limit->flags;
1771
1772         spin_unlock(&fs_info->qgroup_lock);
1773
1774         ret = update_qgroup_limit_item(trans, qgroup);
1775         if (ret) {
1776                 qgroup_mark_inconsistent(fs_info);
1777                 btrfs_info(fs_info, "unable to update quota limit for %llu",
1778                        qgroupid);
1779         }
1780
1781 out:
1782         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1783         return ret;
1784 }
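/*
 * Example of the CLEAR_VALUE convention above: passing max_rfer == (u64)-1
 * with BTRFS_QGROUP_LIMIT_MAX_RFER set in limit->flags removes the
 * referenced-size limit (the flag is cleared and qgroup->max_rfer reset to 0)
 * instead of setting the limit to U64_MAX.  Any other value simply becomes
 * the new limit.  The same convention applies to max_excl, rsv_rfer and
 * rsv_excl.
 */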
1785
1786 /*
1787  * Inform qgroup to trace one dirty extent; its info is recorded in @record,
1788  * so qgroup can account for it at transaction commit time.
1789  *
1790  * No lock version, caller must acquire delayed ref lock and allocated memory,
1791  * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
1792  *
1793  * Return 0 for a successful insertion.
1794  * Return >0 for an existing record; the caller can free @record safely.
1795  * Errors are not possible.
1796  */
1797 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1798                                 struct btrfs_delayed_ref_root *delayed_refs,
1799                                 struct btrfs_qgroup_extent_record *record)
1800 {
1801         struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1802         struct rb_node *parent_node = NULL;
1803         struct btrfs_qgroup_extent_record *entry;
1804         u64 bytenr = record->bytenr;
1805
1806         lockdep_assert_held(&delayed_refs->lock);
1807         trace_btrfs_qgroup_trace_extent(fs_info, record);
1808
1809         while (*p) {
1810                 parent_node = *p;
1811                 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1812                                  node);
1813                 if (bytenr < entry->bytenr) {
1814                         p = &(*p)->rb_left;
1815                 } else if (bytenr > entry->bytenr) {
1816                         p = &(*p)->rb_right;
1817                 } else {
1818                         if (record->data_rsv && !entry->data_rsv) {
1819                                 entry->data_rsv = record->data_rsv;
1820                                 entry->data_rsv_refroot =
1821                                         record->data_rsv_refroot;
1822                         }
1823                         return 1;
1824                 }
1825         }
1826
1827         rb_link_node(&record->node, parent_node, p);
1828         rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1829         return 0;
1830 }
1831
1832 /*
1833  * Post handler after qgroup_trace_extent_nolock().
1834  *
1835  * NOTE: Current qgroup does the expensive backref walk at transaction
1836  * commit time with TRANS_STATE_COMMIT_DOING, which blocks incoming new
1837  * transactions.
1838  * This is designed to allow btrfs_find_all_roots() to get a correct
1839  * new_roots result.
1840  *
1841  * However for old_roots there is no need to do the backref walk at that
1842  * time, since we search commit roots for the walk and the result will
1843  * always be correct.
1844  *
1845  * Due to the nature of the nolock version, we can't do the backref walk
1846  * there, so we must call btrfs_qgroup_trace_extent_post() after exiting
1847  * the spinlock context.
1848  *
1849  * TODO: If we can fix and prove btrfs_find_all_roots() can get a correct
1850  * result using the current root, then we can move the expensive backref walk
1851  * out of transaction commit, but not now as qgroup accounting would be wrong.
1852  */
1853 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
1854                                    struct btrfs_qgroup_extent_record *qrecord)
1855 {
1856         struct btrfs_backref_walk_ctx ctx = { 0 };
1857         int ret;
1858
1859         /*
1860          * We are always called in a context where we are already holding a
1861          * transaction handle. Often we are called when adding a data delayed
1862          * reference from btrfs_truncate_inode_items() (truncating or unlinking),
1863          * in which case we will be holding a write lock on an extent buffer from
1864          * a subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1865          * acquire fs_info->commit_root_sem, because that is a higher level lock
1866          * that must be acquired before locking any extent buffers.
1867          *
1868          * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1869          * but we can't pass it a non-NULL transaction handle, because otherwise
1870          * it would not use commit roots and would lock extent buffers, causing
1871          * a deadlock if it ends up trying to read lock the same extent buffer
1872          * that was previously write locked at btrfs_truncate_inode_items().
1873          *
1874          * So pass a NULL transaction handle to btrfs_find_all_roots() and
1875          * explicitly tell it to not acquire the commit_root_sem - if we are
1876          * holding a transaction handle we don't need its protection.
1877          */
1878         ASSERT(trans != NULL);
1879
1880         if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
1881                 return 0;
1882
1883         ctx.bytenr = qrecord->bytenr;
1884         ctx.fs_info = trans->fs_info;
1885
1886         ret = btrfs_find_all_roots(&ctx, true);
1887         if (ret < 0) {
1888                 qgroup_mark_inconsistent(trans->fs_info);
1889                 btrfs_warn(trans->fs_info,
1890 "error accounting new delayed refs extent (err code: %d), quota inconsistent",
1891                         ret);
1892                 return 0;
1893         }
1894
1895         /*
1896          * Here we don't need to take the lock of
1897          * trans->transaction->delayed_refs, since the inserted qrecord won't
1898          * be deleted, only qrecord->node may be modified (new qrecord insert).
1899          *
1900          * So modifying qrecord->old_roots is safe here.
1901          */
1902         qrecord->old_roots = ctx.roots;
1903         return 0;
1904 }
1905
1906 /*
1907  * Inform qgroup to trace one dirty extent, specified by @bytenr and
1908  * @num_bytes.
1909  * So qgroup can account for it at transaction commit time.
1910  *
1911  * Better encapsulated version, with memory allocation and backref walk for
1912  * commit roots.
1913  * So this can sleep.
1914  *
1915  * Return 0 if the operation is done.
1916  * Return <0 for error, like memory allocation failure or invalid parameter
1917  * (NULL trans)
1918  */
1919 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1920                               u64 num_bytes)
1921 {
1922         struct btrfs_fs_info *fs_info = trans->fs_info;
1923         struct btrfs_qgroup_extent_record *record;
1924         struct btrfs_delayed_ref_root *delayed_refs;
1925         int ret;
1926
1927         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1928             || bytenr == 0 || num_bytes == 0)
1929                 return 0;
1930         record = kzalloc(sizeof(*record), GFP_NOFS);
1931         if (!record)
1932                 return -ENOMEM;
1933
1934         delayed_refs = &trans->transaction->delayed_refs;
1935         record->bytenr = bytenr;
1936         record->num_bytes = num_bytes;
1937         record->old_roots = NULL;
1938
1939         spin_lock(&delayed_refs->lock);
1940         ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1941         spin_unlock(&delayed_refs->lock);
1942         if (ret > 0) {
1943                 kfree(record);
1944                 return 0;
1945         }
1946         return btrfs_qgroup_trace_extent_post(trans, record);
1947 }
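/*
 * Summary of the two ways to queue an extent for qgroup accounting:
 *
 * 1) btrfs_qgroup_trace_extent(): allocates the record, inserts it under
 *    delayed_refs->lock and does the old_roots backref walk in one call.
 *    It may sleep, so it must not be called in atomic context.
 *
 * 2) btrfs_qgroup_trace_extent_nolock() + btrfs_qgroup_trace_extent_post():
 *    for callers that already hold delayed_refs->lock; the record must be
 *    allocated up front and the (sleeping) backref walk is done by the
 *    _post call after the lock has been dropped.
 *
 * Either way the record is consumed at transaction commit time by
 * btrfs_qgroup_account_extents().
 */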
1948
1949 /*
1950  * Inform qgroup to trace all leaf items of data.
1951  *
1952  * Return 0 for success.
1953  * Return <0 for errors (e.g. ENOMEM).
1954  */
1955 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1956                                   struct extent_buffer *eb)
1957 {
1958         struct btrfs_fs_info *fs_info = trans->fs_info;
1959         int nr = btrfs_header_nritems(eb);
1960         int i, extent_type, ret;
1961         struct btrfs_key key;
1962         struct btrfs_file_extent_item *fi;
1963         u64 bytenr, num_bytes;
1964
1965         /* We can be called directly from walk_up_proc() */
1966         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1967                 return 0;
1968
1969         for (i = 0; i < nr; i++) {
1970                 btrfs_item_key_to_cpu(eb, &key, i);
1971
1972                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1973                         continue;
1974
1975                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1976                 /* Filter out non qgroup-accountable extents. */
1977                 extent_type = btrfs_file_extent_type(eb, fi);
1978
1979                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1980                         continue;
1981
1982                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1983                 if (!bytenr)
1984                         continue;
1985
1986                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1987
1988                 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
1989                 if (ret)
1990                         return ret;
1991         }
1992         cond_resched();
1993         return 0;
1994 }
1995
1996 /*
1997  * Walk up the tree from the bottom, freeing leaves and any interior
1998  * nodes which have had all slots visited. If a node (leaf or
1999  * interior) is freed, the node above it will have its slot
2000  * incremented. The root node will never be freed.
2001  *
2002  * At the end of this function, we should have a path which has all
2003  * slots incremented to the next position for a search. If we need to
2004  * read a new node it will be NULL and the node above it will have the
2005  * correct slot selected for a later read.
2006  *
2007  * If we increment the root node's slot counter past the number of
2008  * elements, 1 is returned to signal completion of the search.
2009  */
2010 static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
2011 {
2012         int level = 0;
2013         int nr, slot;
2014         struct extent_buffer *eb;
2015
2016         if (root_level == 0)
2017                 return 1;
2018
2019         while (level <= root_level) {
2020                 eb = path->nodes[level];
2021                 nr = btrfs_header_nritems(eb);
2022                 path->slots[level]++;
2023                 slot = path->slots[level];
2024                 if (slot >= nr || level == 0) {
2025                         /*
2026                          * Don't free the root - we will detect this
2027                          * condition after our loop and return a
2028                          * positive value for the caller to stop walking the tree.
2029                          */
2030                         if (level != root_level) {
2031                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
2032                                 path->locks[level] = 0;
2033
2034                                 free_extent_buffer(eb);
2035                                 path->nodes[level] = NULL;
2036                                 path->slots[level] = 0;
2037                         }
2038                 } else {
2039                         /*
2040                          * We have a valid slot to walk back down
2041                          * from. Stop here so caller can process these
2042                          * new nodes.
2043                          */
2044                         break;
2045                 }
2046
2047                 level++;
2048         }
2049
2050         eb = path->nodes[root_level];
2051         if (path->slots[root_level] >= btrfs_header_nritems(eb))
2052                 return 1;
2053
2054         return 0;
2055 }
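/*
 * A short example of the slot adjustment above: with root_level == 2 and a
 * path whose leaf (level 0) and level 1 node have both been fully visited,
 * the loop frees nodes[0] and nodes[1], resets their slots to 0 and bumps
 * slots[2].  If slots[2] still points inside the root node the caller walks
 * back down from the root using the new slot; once slots[2] goes past the
 * last pointer, 1 is returned and the search is complete.
 */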
2056
2057 /*
2058  * Helper function to trace a subtree tree block swap.
2059  *
2060  * The swap will happen in highest tree block, but there may be a lot of
2061  * tree blocks involved.
2062  *
2063  * For example:
2064  *  OO = Old tree blocks
2065  *  NN = New tree blocks allocated during balance
2066  *
2067  *           File tree (257)                  Reloc tree for 257
2068  * L2              OO                                NN
2069  *               /    \                            /    \
2070  * L1          OO      OO (a)                    OO      NN (a)
2071  *            / \     / \                       / \     / \
2072  * L0       OO   OO OO   OO                   OO   OO NN   NN
2073  *                  (b)  (c)                          (b)  (c)
2074  *
2075  * When calling qgroup_trace_extent_swap(), we will pass:
2076  * @src_eb = OO(a)
2077  * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
2078  * @dst_level = 0
2079  * @root_level = 1
2080  *
2081  * In that case, qgroup_trace_extent_swap() will search from OO(a) to
2082  * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
2083  *
2084  * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
2085  *
2086  * 1) Tree search from @src_eb
2087  *    It should act as a simplified btrfs_search_slot().
2088  *    The key for search can be extracted from @dst_path->nodes[dst_level]
2089  *    (first key).
2090  *
2091  * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
2092  *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
2093  *    They should be marked during previous (@dst_level = 1) iteration.
2094  *
2095  * 3) Mark file extents in leaves dirty
2096  *    We don't have a good way to pick out only the new file extents,
2097  *    so we still follow the old method of scanning all file extents in
2098  *    the leaf.
2099  *
2100  * This function frees us from keeping two paths, thus later we only need
2101  * to care about how to iterate over all new tree blocks in the reloc tree.
2102  */
2103 static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
2104                                     struct extent_buffer *src_eb,
2105                                     struct btrfs_path *dst_path,
2106                                     int dst_level, int root_level,
2107                                     bool trace_leaf)
2108 {
2109         struct btrfs_key key;
2110         struct btrfs_path *src_path;
2111         struct btrfs_fs_info *fs_info = trans->fs_info;
2112         u32 nodesize = fs_info->nodesize;
2113         int cur_level = root_level;
2114         int ret;
2115
2116         BUG_ON(dst_level > root_level);
2117         /* Level mismatch */
2118         if (btrfs_header_level(src_eb) != root_level)
2119                 return -EINVAL;
2120
2121         src_path = btrfs_alloc_path();
2122         if (!src_path) {
2123                 ret = -ENOMEM;
2124                 goto out;
2125         }
2126
2127         if (dst_level)
2128                 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2129         else
2130                 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2131
2132         /* For src_path */
2133         atomic_inc(&src_eb->refs);
2134         src_path->nodes[root_level] = src_eb;
2135         src_path->slots[root_level] = dst_path->slots[root_level];
2136         src_path->locks[root_level] = 0;
2137
2138         /* A simplified version of btrfs_search_slot() */
2139         while (cur_level >= dst_level) {
2140                 struct btrfs_key src_key;
2141                 struct btrfs_key dst_key;
2142
2143                 if (src_path->nodes[cur_level] == NULL) {
2144                         struct extent_buffer *eb;
2145                         int parent_slot;
2146
2147                         eb = src_path->nodes[cur_level + 1];
2148                         parent_slot = src_path->slots[cur_level + 1];
2149
2150                         eb = btrfs_read_node_slot(eb, parent_slot);
2151                         if (IS_ERR(eb)) {
2152                                 ret = PTR_ERR(eb);
2153                                 goto out;
2154                         }
2155
2156                         src_path->nodes[cur_level] = eb;
2157
2158                         btrfs_tree_read_lock(eb);
2159                         src_path->locks[cur_level] = BTRFS_READ_LOCK;
2160                 }
2161
2162                 src_path->slots[cur_level] = dst_path->slots[cur_level];
2163                 if (cur_level) {
2164                         btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2165                                         &dst_key, dst_path->slots[cur_level]);
2166                         btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2167                                         &src_key, src_path->slots[cur_level]);
2168                 } else {
2169                         btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2170                                         &dst_key, dst_path->slots[cur_level]);
2171                         btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2172                                         &src_key, src_path->slots[cur_level]);
2173                 }
2174                 /* Content mismatch, something went wrong */
2175                 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2176                         ret = -ENOENT;
2177                         goto out;
2178                 }
2179                 cur_level--;
2180         }
2181
2182         /*
2183          * Now both @dst_path and @src_path have been populated, record the tree
2184          * blocks for qgroup accounting.
2185          */
2186         ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2187                                         nodesize);
2188         if (ret < 0)
2189                 goto out;
2190         ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
2191                                         nodesize);
2192         if (ret < 0)
2193                 goto out;
2194
2195         /* Record leaf file extents */
2196         if (dst_level == 0 && trace_leaf) {
2197                 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2198                 if (ret < 0)
2199                         goto out;
2200                 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2201         }
2202 out:
2203         btrfs_free_path(src_path);
2204         return ret;
2205 }
2206
2207 /*
2208  * Helper function to do recursive generation-aware depth-first search, to
2209  * locate all new tree blocks in a subtree of reloc tree.
2210  *
2211  * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2212  *         reloc tree
2213  * L2         NN (a)
2214  *          /    \
2215  * L1    OO        NN (b)
2216  *      /  \      /  \
2217  * L0  OO  OO    OO  NN
2218  *               (c) (d)
2219  * If we pass:
2220  * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2221  * @cur_level = 1
2222  * @root_level = 1
2223  *
2224  * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
2225  * the above tree blocks along with their counterparts in the file tree.
2226  * During the search, old tree block OO(c) will be skipped as the tree block
2227  * swap won't affect OO(c).
2228  */
2229 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2230                                            struct extent_buffer *src_eb,
2231                                            struct btrfs_path *dst_path,
2232                                            int cur_level, int root_level,
2233                                            u64 last_snapshot, bool trace_leaf)
2234 {
2235         struct btrfs_fs_info *fs_info = trans->fs_info;
2236         struct extent_buffer *eb;
2237         bool need_cleanup = false;
2238         int ret = 0;
2239         int i;
2240
2241         /* Level sanity check */
2242         if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2243             root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2244             root_level < cur_level) {
2245                 btrfs_err_rl(fs_info,
2246                         "%s: bad levels, cur_level=%d root_level=%d",
2247                         __func__, cur_level, root_level);
2248                 return -EUCLEAN;
2249         }
2250
2251         /* Read the tree block if needed */
2252         if (dst_path->nodes[cur_level] == NULL) {
2253                 int parent_slot;
2254                 u64 child_gen;
2255
2256                 /*
2257                  * dst_path->nodes[root_level] must be initialized before
2258                  * calling this function.
2259                  */
2260                 if (cur_level == root_level) {
2261                         btrfs_err_rl(fs_info,
2262         "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2263                                 __func__, root_level, root_level, cur_level);
2264                         return -EUCLEAN;
2265                 }
2266
2267                 /*
2268                  * We need to get child blockptr/gen from parent before we can
2269                  * read it.
2270                  */
2271                 eb = dst_path->nodes[cur_level + 1];
2272                 parent_slot = dst_path->slots[cur_level + 1];
2273                 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2274
2275                 /* This node is old, no need to trace */
2276                 if (child_gen < last_snapshot)
2277                         goto out;
2278
2279                 eb = btrfs_read_node_slot(eb, parent_slot);
2280                 if (IS_ERR(eb)) {
2281                         ret = PTR_ERR(eb);
2282                         goto out;
2283                 }
2284
2285                 dst_path->nodes[cur_level] = eb;
2286                 dst_path->slots[cur_level] = 0;
2287
2288                 btrfs_tree_read_lock(eb);
2289                 dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2290                 need_cleanup = true;
2291         }
2292
2293         /* Now record this tree block and its counter part for qgroups */
2294         ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2295                                        root_level, trace_leaf);
2296         if (ret < 0)
2297                 goto cleanup;
2298
2299         eb = dst_path->nodes[cur_level];
2300
2301         if (cur_level > 0) {
2302                 /* Iterate all child tree blocks */
2303                 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2304                         /* Skip old tree blocks as they won't be swapped */
2305                         if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2306                                 continue;
2307                         dst_path->slots[cur_level] = i;
2308
2309                         /* Recursive call (at most 7 times) */
2310                         ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2311                                         dst_path, cur_level - 1, root_level,
2312                                         last_snapshot, trace_leaf);
2313                         if (ret < 0)
2314                                 goto cleanup;
2315                 }
2316         }
2317
2318 cleanup:
2319         if (need_cleanup) {
2320                 /* Clean up */
2321                 btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2322                                      dst_path->locks[cur_level]);
2323                 free_extent_buffer(dst_path->nodes[cur_level]);
2324                 dst_path->nodes[cur_level] = NULL;
2325                 dst_path->slots[cur_level] = 0;
2326                 dst_path->locks[cur_level] = 0;
2327         }
2328 out:
2329         return ret;
2330 }
2331
2332 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2333                                 struct extent_buffer *src_eb,
2334                                 struct extent_buffer *dst_eb,
2335                                 u64 last_snapshot, bool trace_leaf)
2336 {
2337         struct btrfs_fs_info *fs_info = trans->fs_info;
2338         struct btrfs_path *dst_path = NULL;
2339         int level;
2340         int ret;
2341
2342         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2343                 return 0;
2344
2345         /* Wrong parameter order */
2346         if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2347                 btrfs_err_rl(fs_info,
2348                 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2349                              btrfs_header_generation(src_eb),
2350                              btrfs_header_generation(dst_eb));
2351                 return -EUCLEAN;
2352         }
2353
2354         if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2355                 ret = -EIO;
2356                 goto out;
2357         }
2358
2359         level = btrfs_header_level(dst_eb);
2360         dst_path = btrfs_alloc_path();
2361         if (!dst_path) {
2362                 ret = -ENOMEM;
2363                 goto out;
2364         }
2365         /* For dst_path */
2366         atomic_inc(&dst_eb->refs);
2367         dst_path->nodes[level] = dst_eb;
2368         dst_path->slots[level] = 0;
2369         dst_path->locks[level] = 0;
2370
2371         /* Do the generation-aware depth-first search */
2372         ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2373                                               level, last_snapshot, trace_leaf);
2374         if (ret < 0)
2375                 goto out;
2376         ret = 0;
2377
2378 out:
2379         btrfs_free_path(dst_path);
2380         if (ret < 0)
2381                 qgroup_mark_inconsistent(fs_info);
2382         return ret;
2383 }
2384
2385 /*
2386  * Inform qgroup to trace a whole subtree, including all its child tree
2387  * blocks and data.
2388  * The root tree block is specified by @root_eb.
2389  *
2390  * Normally used by relocation(tree block swap) and subvolume deletion.
2391  *
2392  * Return 0 for success
2393  * Return <0 for error(ENOMEM or tree search error)
2394  */
2395 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2396                                struct extent_buffer *root_eb,
2397                                u64 root_gen, int root_level)
2398 {
2399         struct btrfs_fs_info *fs_info = trans->fs_info;
2400         int ret = 0;
2401         int level;
2402         u8 drop_subptree_thres;
2403         struct extent_buffer *eb = root_eb;
2404         struct btrfs_path *path = NULL;
2405
2406         BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2407         BUG_ON(root_eb == NULL);
2408
2409         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2410                 return 0;
2411
2412         spin_lock(&fs_info->qgroup_lock);
2413         drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
2414         spin_unlock(&fs_info->qgroup_lock);
2415
2416         /*
2417          * This function only gets called for snapshot drop; if we hit a high
2418          * node here, it means we are going to change ownership for quite a lot
2419          * of extents, which will greatly slow down btrfs_commit_transaction().
2420          *
2421          * So if we find a high tree here, we just skip the accounting and
2422          * mark the qgroup inconsistent.
2423          */
2424         if (root_level >= drop_subptree_thres) {
2425                 qgroup_mark_inconsistent(fs_info);
2426                 return 0;
2427         }
2428
2429         if (!extent_buffer_uptodate(root_eb)) {
2430                 struct btrfs_tree_parent_check check = {
2431                         .has_first_key = false,
2432                         .transid = root_gen,
2433                         .level = root_level
2434                 };
2435
2436                 ret = btrfs_read_extent_buffer(root_eb, &check);
2437                 if (ret)
2438                         goto out;
2439         }
2440
2441         if (root_level == 0) {
2442                 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2443                 goto out;
2444         }
2445
2446         path = btrfs_alloc_path();
2447         if (!path)
2448                 return -ENOMEM;
2449
2450         /*
2451          * Walk down the tree.  Missing extent blocks are filled in as
2452          * we go. Metadata is accounted every time we read a new
2453          * extent block.
2454          *
2455          * When we reach a leaf, we account for file extent items in it,
2456          * walk back up the tree (adjusting slot pointers as we go)
2457          * and restart the search process.
2458          */
2459         atomic_inc(&root_eb->refs);     /* For path */
2460         path->nodes[root_level] = root_eb;
2461         path->slots[root_level] = 0;
2462         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2463 walk_down:
2464         level = root_level;
2465         while (level >= 0) {
2466                 if (path->nodes[level] == NULL) {
2467                         int parent_slot;
2468                         u64 child_bytenr;
2469
2470                         /*
2471                          * We need to get child blockptr from parent before we
2472                          * can read it.
2473                          */
2474                         eb = path->nodes[level + 1];
2475                         parent_slot = path->slots[level + 1];
2476                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2477
2478                         eb = btrfs_read_node_slot(eb, parent_slot);
2479                         if (IS_ERR(eb)) {
2480                                 ret = PTR_ERR(eb);
2481                                 goto out;
2482                         }
2483
2484                         path->nodes[level] = eb;
2485                         path->slots[level] = 0;
2486
2487                         btrfs_tree_read_lock(eb);
2488                         path->locks[level] = BTRFS_READ_LOCK;
2489
2490                         ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2491                                                         fs_info->nodesize);
2492                         if (ret)
2493                                 goto out;
2494                 }
2495
2496                 if (level == 0) {
2497                         ret = btrfs_qgroup_trace_leaf_items(trans,
2498                                                             path->nodes[level]);
2499                         if (ret)
2500                                 goto out;
2501
2502                         /* Nonzero return here means we completed our search */
2503                         ret = adjust_slots_upwards(path, root_level);
2504                         if (ret)
2505                                 break;
2506
2507                         /* Restart search with new slots */
2508                         goto walk_down;
2509                 }
2510
2511                 level--;
2512         }
2513
2514         ret = 0;
2515 out:
2516         btrfs_free_path(path);
2517
2518         return ret;
2519 }
2520
2521 static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
2522 {
2523         if (!list_empty(&qgroup->nested_iterator))
2524                 return;
2525
2526         list_add_tail(&qgroup->nested_iterator, head);
2527 }
2528
2529 static void qgroup_iterator_nested_clean(struct list_head *head)
2530 {
2531         while (!list_empty(head)) {
2532                 struct btrfs_qgroup *qgroup;
2533
2534                 qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
2535                 list_del_init(&qgroup->nested_iterator);
2536         }
2537 }
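/*
 * Why two iterator list heads are needed: qgroup_update_refcnt() below keeps
 * one long-lived list of every qgroup touched by the current extent (linked
 * via ->nested_iterator, cleaned only when the accounting of the extent is
 * done) while it also walks each root's ancestors through a short-lived
 * per-root list (linked via ->iterator, cleaned after every root).  A qgroup
 * can sit on both lists at the same time, hence the two separate list_head
 * members.
 */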
2538
2539 #define UPDATE_NEW      0
2540 #define UPDATE_OLD      1
2541 /*
2542  * Walk all of the roots that point to the bytenr and adjust their refcnts.
2543  */
2544 static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2545                                  struct ulist *roots, struct list_head *qgroups,
2546                                  u64 seq, int update_old)
2547 {
2548         struct ulist_node *unode;
2549         struct ulist_iterator uiter;
2550         struct btrfs_qgroup *qg;
2551
2552         if (!roots)
2553                 return;
2554         ULIST_ITER_INIT(&uiter);
2555         while ((unode = ulist_next(roots, &uiter))) {
2556                 LIST_HEAD(tmp);
2557
2558                 qg = find_qgroup_rb(fs_info, unode->val);
2559                 if (!qg)
2560                         continue;
2561
2562                 qgroup_iterator_nested_add(qgroups, qg);
2563                 qgroup_iterator_add(&tmp, qg);
2564                 list_for_each_entry(qg, &tmp, iterator) {
2565                         struct btrfs_qgroup_list *glist;
2566
2567                         if (update_old)
2568                                 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2569                         else
2570                                 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2571
2572                         list_for_each_entry(glist, &qg->groups, next_group) {
2573                                 qgroup_iterator_nested_add(qgroups, glist->group);
2574                                 qgroup_iterator_add(&tmp, glist->group);
2575                         }
2576                 }
2577                 qgroup_iterator_clean(&tmp);
2578         }
2579 }
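/*
 * How the refcnts above are consumed: for every accounted extent a fresh
 * @seq is taken from fs_info->qgroup_seq.  Walking the old_roots ulist bumps
 * the old refcnt (relative to @seq) of every qgroup reachable from those
 * roots, and walking new_roots bumps the new refcnt.  qgroup_update_counters()
 * below then compares, per qgroup, how many of the old/new roots reached it
 * (cur_old_count/cur_new_count) against the totals (nr_old_roots/nr_new_roots)
 * to decide whether the extent was, or now is, exclusive to that qgroup.
 */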
2580
2581 /*
2582  * Update qgroup rfer/excl counters.
2583  * The rfer update is easy, the code can explain itself.
2584  *
2585  * The excl update is tricky, the update is split into 2 parts.
2586  * Part 1: Possible exclusive <-> sharing detect:
2587  *      |       A       |       !A      |
2588  *  -------------------------------------
2589  *  B   |       *       |       -       |
2590  *  -------------------------------------
2591  *  !B  |       +       |       **      |
2592  *  -------------------------------------
2593  *
2594  * Conditions:
2595  * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2596  * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2597  * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2598  * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2599  *
2600  * Results:
2601  * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2602  * *: Definitely not changed.           **: Possible unchanged.
2603  *
2604  * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
2605  * case.
2606  *
2607  * To make the logic clear, we first use conditions A and B to split the
2608  * combinations into 4 results.
2609  *
2610  * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
2611  * them only one variant may be 0.
2612  *
2613  * Lastly, check result **; since there are 2 variants that may be 0, split
2614  * them again (2x2). This time we don't need to consider other things, the
2615  * code and logic are easy to follow now. (See the worked example below.)
2616  */
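/*
 * Worked example for the table above (numbers are illustrative only):
 * an extent of num_bytes was referenced by 2 roots before the change
 * (nr_old_roots == 2) and by 3 roots after it (nr_new_roots == 3).
 *
 * For a qgroup with cur_old_count == 2 and cur_new_count == 2 we have
 * !A (possibly exclusive before) and B (not exclusive now): the extent was
 * reachable through every old root of this qgroup but no longer is through
 * every new root, so excl and excl_cmpr are decreased by num_bytes, while
 * rfer stays untouched because the qgroup still references the extent.
 */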
2617 static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
2618                                    struct list_head *qgroups, u64 nr_old_roots,
2619                                    u64 nr_new_roots, u64 num_bytes, u64 seq)
2620 {
2621         struct btrfs_qgroup *qg;
2622
2623         list_for_each_entry(qg, qgroups, nested_iterator) {
2624                 u64 cur_new_count, cur_old_count;
2625                 bool dirty = false;
2626
2627                 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2628                 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2629
2630                 trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2631                                              cur_new_count);
2632
2633                 /* Rfer update part */
2634                 if (cur_old_count == 0 && cur_new_count > 0) {
2635                         qg->rfer += num_bytes;
2636                         qg->rfer_cmpr += num_bytes;
2637                         dirty = true;
2638                 }
2639                 if (cur_old_count > 0 && cur_new_count == 0) {
2640                         qg->rfer -= num_bytes;
2641                         qg->rfer_cmpr -= num_bytes;
2642                         dirty = true;
2643                 }
2644
2645                 /* Excl update part */
2646                 /* Exclusive/none -> shared case */
2647                 if (cur_old_count == nr_old_roots &&
2648                     cur_new_count < nr_new_roots) {
2649                         /* Exclusive -> shared */
2650                         if (cur_old_count != 0) {
2651                                 qg->excl -= num_bytes;
2652                                 qg->excl_cmpr -= num_bytes;
2653                                 dirty = true;
2654                         }
2655                 }
2656
2657                 /* Shared -> exclusive/none case */
2658                 if (cur_old_count < nr_old_roots &&
2659                     cur_new_count == nr_new_roots) {
2660                         /* Shared->exclusive */
2661                         if (cur_new_count != 0) {
2662                                 qg->excl += num_bytes;
2663                                 qg->excl_cmpr += num_bytes;
2664                                 dirty = true;
2665                         }
2666                 }
2667
2668                 /* Exclusive/none -> exclusive/none case */
2669                 if (cur_old_count == nr_old_roots &&
2670                     cur_new_count == nr_new_roots) {
2671                         if (cur_old_count == 0) {
2672                                 /* None -> exclusive/none */
2673
2674                                 if (cur_new_count != 0) {
2675                                         /* None -> exclusive */
2676                                         qg->excl += num_bytes;
2677                                         qg->excl_cmpr += num_bytes;
2678                                         dirty = true;
2679                                 }
2680                                 /* None -> none, nothing changed */
2681                         } else {
2682                                 /* Exclusive -> exclusive/none */
2683
2684                                 if (cur_new_count == 0) {
2685                                         /* Exclusive -> none */
2686                                         qg->excl -= num_bytes;
2687                                         qg->excl_cmpr -= num_bytes;
2688                                         dirty = true;
2689                                 }
2690                                 /* Exclusive -> exclusive, nothing changed */
2691                         }
2692                 }
2693
2694                 if (dirty)
2695                         qgroup_dirty(fs_info, qg);
2696         }
2697 }
2698
2699 /*
2700  * Check if the @roots potentially is a list of fs tree roots
2701  *
2702  * Return 0 for definitely not a fs/subvol tree roots ulist
2703  * Return 1 for possible fs/subvol tree roots in the list (considering an empty
2704  *          one as well)
2705  */
2706 static int maybe_fs_roots(struct ulist *roots)
2707 {
2708         struct ulist_node *unode;
2709         struct ulist_iterator uiter;
2710
2711         /* Empty one, still possible for fs roots */
2712         if (!roots || roots->nnodes == 0)
2713                 return 1;
2714
2715         ULIST_ITER_INIT(&uiter);
2716         unode = ulist_next(roots, &uiter);
2717         if (!unode)
2718                 return 1;
2719
2720         /*
2721          * If it contains fs tree roots, then it must belong to fs/subvol
2722          * trees.
2723          * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2724          */
2725         return is_fstree(unode->val);
2726 }
2727
2728 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2729                                 u64 num_bytes, struct ulist *old_roots,
2730                                 struct ulist *new_roots)
2731 {
2732         struct btrfs_fs_info *fs_info = trans->fs_info;
2733         LIST_HEAD(qgroups);
2734         u64 seq;
2735         u64 nr_new_roots = 0;
2736         u64 nr_old_roots = 0;
2737         int ret = 0;
2738
2739         /*
2740          * If quotas get disabled meanwhile, the resources need to be freed and
2741          * we can't just exit here.
2742          */
2743         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
2744             fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
2745                 goto out_free;
2746
2747         if (new_roots) {
2748                 if (!maybe_fs_roots(new_roots))
2749                         goto out_free;
2750                 nr_new_roots = new_roots->nnodes;
2751         }
2752         if (old_roots) {
2753                 if (!maybe_fs_roots(old_roots))
2754                         goto out_free;
2755                 nr_old_roots = old_roots->nnodes;
2756         }
2757
2758         /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2759         if (nr_old_roots == 0 && nr_new_roots == 0)
2760                 goto out_free;
2761
2762         BUG_ON(!fs_info->quota_root);
2763
2764         trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2765                                         num_bytes, nr_old_roots, nr_new_roots);
2766
2767         mutex_lock(&fs_info->qgroup_rescan_lock);
2768         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2769                 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2770                         mutex_unlock(&fs_info->qgroup_rescan_lock);
2771                         ret = 0;
2772                         goto out_free;
2773                 }
2774         }
2775         mutex_unlock(&fs_info->qgroup_rescan_lock);
2776
2777         spin_lock(&fs_info->qgroup_lock);
2778         seq = fs_info->qgroup_seq;
2779
2780         /* Update old refcnts using old_roots */
2781         qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
2782
2783         /* Update new refcnts using new_roots */
2784         qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
2785
2786         qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
2787                                num_bytes, seq);
2788
2789         /*
2790          * Bump qgroup_seq so this extent's refcnts can't overlap the next one's.
2791          */
2792         fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2793         spin_unlock(&fs_info->qgroup_lock);
2794 out_free:
2795         qgroup_iterator_nested_clean(&qgroups);
2796         ulist_free(old_roots);
2797         ulist_free(new_roots);
2798         return ret;
2799 }
2800
2801 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2802 {
2803         struct btrfs_fs_info *fs_info = trans->fs_info;
2804         struct btrfs_qgroup_extent_record *record;
2805         struct btrfs_delayed_ref_root *delayed_refs;
2806         struct ulist *new_roots = NULL;
2807         struct rb_node *node;
2808         u64 num_dirty_extents = 0;
2809         u64 qgroup_to_skip;
2810         int ret = 0;
2811
2812         delayed_refs = &trans->transaction->delayed_refs;
2813         qgroup_to_skip = delayed_refs->qgroup_to_skip;
2814         while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2815                 record = rb_entry(node, struct btrfs_qgroup_extent_record,
2816                                   node);
2817
2818                 num_dirty_extents++;
2819                 trace_btrfs_qgroup_account_extents(fs_info, record);
2820
2821                 if (!ret && !(fs_info->qgroup_flags &
2822                               BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
2823                         struct btrfs_backref_walk_ctx ctx = { 0 };
2824
2825                         ctx.bytenr = record->bytenr;
2826                         ctx.fs_info = fs_info;
2827
2828                         /*
2829                          * Old roots should be searched when inserting the
2830                          * qgroup extent record.
2831                          *
2832                          * But for the INCONSISTENT (NO_ACCOUNTING) -> rescan
2833                          * case, we may have some records inserted during
2834                          * NO_ACCOUNTING (thus no old_roots populated), but
2835                          * later we start a rescan, which clears NO_ACCOUNTING,
2836                          * leaving some inserted records without old_roots
2837                          * populated.
2838                          *
2839                          * Those cases are rare and should not cause too much
2840                          * time spent during commit_transaction().
2841                          */
2842                         if (!record->old_roots) {
2843                                 /* Search commit root to find old_roots */
2844                                 ret = btrfs_find_all_roots(&ctx, false);
2845                                 if (ret < 0)
2846                                         goto cleanup;
2847                                 record->old_roots = ctx.roots;
2848                                 ctx.roots = NULL;
2849                         }
2850
2851                         /* Free the reserved data space */
2852                         btrfs_qgroup_free_refroot(fs_info,
2853                                         record->data_rsv_refroot,
2854                                         record->data_rsv,
2855                                         BTRFS_QGROUP_RSV_DATA);
2856                         /*
2857                          * Use BTRFS_SEQ_LAST as time_seq to do a special search
2858                          * which doesn't lock the tree or delayed_refs and searches
2859                          * the current root. It's safe inside commit_transaction().
2860                          */
2861                         ctx.trans = trans;
2862                         ctx.time_seq = BTRFS_SEQ_LAST;
2863                         ret = btrfs_find_all_roots(&ctx, false);
2864                         if (ret < 0)
2865                                 goto cleanup;
2866                         new_roots = ctx.roots;
2867                         if (qgroup_to_skip) {
2868                                 ulist_del(new_roots, qgroup_to_skip, 0);
2869                                 ulist_del(record->old_roots, qgroup_to_skip,
2870                                           0);
2871                         }
2872                         ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2873                                                           record->num_bytes,
2874                                                           record->old_roots,
2875                                                           new_roots);
2876                         record->old_roots = NULL;
2877                         new_roots = NULL;
2878                 }
2879 cleanup:
2880                 ulist_free(record->old_roots);
2881                 ulist_free(new_roots);
2882                 new_roots = NULL;
2883                 rb_erase(node, &delayed_refs->dirty_extent_root);
2884                 kfree(record);
2885
2886         }
2887         trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2888                                        num_dirty_extents);
2889         return ret;
2890 }
2891
2892 /*
2893  * Writes all changed qgroups to disk.
2894  * Called by the transaction commit path and the qgroup assign ioctl.
2895  */
2896 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2897 {
2898         struct btrfs_fs_info *fs_info = trans->fs_info;
2899         int ret = 0;
2900
2901         /*
2902          * In case we are called from the qgroup assign ioctl, assert that we
2903          * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
2904          * disable operation (ioctl) and access a freed quota root.
2905          */
2906         if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
2907                 lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
2908
2909         if (!fs_info->quota_root)
2910                 return ret;
2911
2912         spin_lock(&fs_info->qgroup_lock);
2913         while (!list_empty(&fs_info->dirty_qgroups)) {
2914                 struct btrfs_qgroup *qgroup;
2915                 qgroup = list_first_entry(&fs_info->dirty_qgroups,
2916                                           struct btrfs_qgroup, dirty);
2917                 list_del_init(&qgroup->dirty);
2918                 spin_unlock(&fs_info->qgroup_lock);
2919                 ret = update_qgroup_info_item(trans, qgroup);
2920                 if (ret)
2921                         qgroup_mark_inconsistent(fs_info);
2922                 ret = update_qgroup_limit_item(trans, qgroup);
2923                 if (ret)
2924                         qgroup_mark_inconsistent(fs_info);
2925                 spin_lock(&fs_info->qgroup_lock);
2926         }
2927         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2928                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2929         else
2930                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2931         spin_unlock(&fs_info->qgroup_lock);
2932
2933         ret = update_qgroup_status_item(trans);
2934         if (ret)
2935                 qgroup_mark_inconsistent(fs_info);
2936
2937         return ret;
2938 }
2939
2940 /*
2941  * Copy the accounting information between qgroups. This is necessary
2942  * when a snapshot or a subvolume is created. Throwing an error will
2943  * cause a transaction abort so we take extra care here to only error
2944  * when a readonly fs is a reasonable outcome.
2945  */
2946 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2947                          u64 objectid, struct btrfs_qgroup_inherit *inherit)
2948 {
2949         int ret = 0;
2950         int i;
2951         u64 *i_qgroups;
2952         bool committing = false;
2953         struct btrfs_fs_info *fs_info = trans->fs_info;
2954         struct btrfs_root *quota_root;
2955         struct btrfs_qgroup *srcgroup;
2956         struct btrfs_qgroup *dstgroup;
2957         struct btrfs_qgroup *prealloc;
2958         struct btrfs_qgroup_list **qlist_prealloc = NULL;
2959         bool need_rescan = false;
2960         u32 level_size = 0;
2961         u64 nums;
2962
2963         prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
2964         if (!prealloc)
2965                 return -ENOMEM;
2966
2967         /*
2968          * There are only two callers of this function.
2969          *
2970          * One is in create_subvol() in the ioctl context, which needs to hold
2971          * the qgroup_ioctl_lock.
2972          *
2973          * The other one is in create_pending_snapshot(), where no other qgroup
2974          * code can modify the fs as they all need to either start a new trans
2975          * or hold a trans handle, thus we don't need to hold the
2976          * qgroup_ioctl_lock.
2977          * This avoids a long and complex lock chain and keeps lockdep happy.
2978          */
2979         spin_lock(&fs_info->trans_lock);
2980         if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2981                 committing = true;
2982         spin_unlock(&fs_info->trans_lock);
2983
2984         if (!committing)
2985                 mutex_lock(&fs_info->qgroup_ioctl_lock);
2986         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2987                 goto out;
2988
2989         quota_root = fs_info->quota_root;
2990         if (!quota_root) {
2991                 ret = -EINVAL;
2992                 goto out;
2993         }
2994
2995         if (inherit) {
2996                 i_qgroups = (u64 *)(inherit + 1);
2997                 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2998                        2 * inherit->num_excl_copies;
2999                 for (i = 0; i < nums; ++i) {
3000                         srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
3001
3002                         /*
3003                          * Zero out invalid groups so we can ignore
3004                          * them later.
3005                          */
3006                         if (!srcgroup ||
3007                             ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
3008                                 *i_qgroups = 0ULL;
3009
3010                         ++i_qgroups;
3011                 }
3012         }
3013
3014         /*
3015          * create a tracking group for the subvol itself
3016          */
3017         ret = add_qgroup_item(trans, quota_root, objectid);
3018         if (ret)
3019                 goto out;
3020
3021         /*
3022          * add qgroup to all inherited groups
3023          */
3024         if (inherit) {
3025                 i_qgroups = (u64 *)(inherit + 1);
3026                 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
3027                         if (*i_qgroups == 0)
3028                                 continue;
3029                         ret = add_qgroup_relation_item(trans, objectid,
3030                                                        *i_qgroups);
3031                         if (ret && ret != -EEXIST)
3032                                 goto out;
3033                         ret = add_qgroup_relation_item(trans, *i_qgroups,
3034                                                        objectid);
3035                         if (ret && ret != -EEXIST)
3036                                 goto out;
3037                 }
3038                 ret = 0;
3039
3040                 qlist_prealloc = kcalloc(inherit->num_qgroups,
3041                                          sizeof(struct btrfs_qgroup_list *),
3042                                          GFP_NOFS);
3043                 if (!qlist_prealloc) {
3044                         ret = -ENOMEM;
3045                         goto out;
3046                 }
3047                 for (int i = 0; i < inherit->num_qgroups; i++) {
3048                         qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
3049                                                     GFP_NOFS);
3050                         if (!qlist_prealloc[i]) {
3051                                 ret = -ENOMEM;
3052                                 goto out;
3053                         }
3054                 }
3055         }
3056
3057         spin_lock(&fs_info->qgroup_lock);
3058
3059         dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
3060         prealloc = NULL;
3061
3062         if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
3063                 dstgroup->lim_flags = inherit->lim.flags;
3064                 dstgroup->max_rfer = inherit->lim.max_rfer;
3065                 dstgroup->max_excl = inherit->lim.max_excl;
3066                 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
3067                 dstgroup->rsv_excl = inherit->lim.rsv_excl;
3068
3069                 qgroup_dirty(fs_info, dstgroup);
3070         }
3071
3072         if (srcid) {
3073                 srcgroup = find_qgroup_rb(fs_info, srcid);
3074                 if (!srcgroup)
3075                         goto unlock;
3076
3077                 /*
3078                  * We call inherit after we clone the root in order to make sure
3079                  * our counts don't go crazy, so at this point the only
3080                  * difference between the two roots should be the root node.
3081                  */
3082                 level_size = fs_info->nodesize;
3083                 dstgroup->rfer = srcgroup->rfer;
3084                 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
3085                 dstgroup->excl = level_size;
3086                 dstgroup->excl_cmpr = level_size;
3087                 srcgroup->excl = level_size;
3088                 srcgroup->excl_cmpr = level_size;
3089
3090                 /* inherit the limit info */
3091                 dstgroup->lim_flags = srcgroup->lim_flags;
3092                 dstgroup->max_rfer = srcgroup->max_rfer;
3093                 dstgroup->max_excl = srcgroup->max_excl;
3094                 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
3095                 dstgroup->rsv_excl = srcgroup->rsv_excl;
3096
3097                 qgroup_dirty(fs_info, dstgroup);
3098                 qgroup_dirty(fs_info, srcgroup);
3099         }
3100
3101         if (!inherit)
3102                 goto unlock;
3103
3104         i_qgroups = (u64 *)(inherit + 1);
3105         for (i = 0; i < inherit->num_qgroups; ++i) {
3106                 if (*i_qgroups) {
3107                         ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
3108                                               *i_qgroups);
3109                         qlist_prealloc[i] = NULL;
3110                         if (ret)
3111                                 goto unlock;
3112                 }
3113                 ++i_qgroups;
3114
3115                 /*
3116                  * If we're doing a snapshot, and adding the snapshot to a new
3117                  * qgroup, the numbers are guaranteed to be incorrect.
3118                  */
3119                 if (srcid)
3120                         need_rescan = true;
3121         }
3122
3123         for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
3124                 struct btrfs_qgroup *src;
3125                 struct btrfs_qgroup *dst;
3126
3127                 if (!i_qgroups[0] || !i_qgroups[1])
3128                         continue;
3129
3130                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
3131                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
3132
3133                 if (!src || !dst) {
3134                         ret = -EINVAL;
3135                         goto unlock;
3136                 }
3137
3138                 dst->rfer = src->rfer - level_size;
3139                 dst->rfer_cmpr = src->rfer_cmpr - level_size;
3140
3141                 /* Manually tweaking numbers certainly needs a rescan */
3142                 need_rescan = true;
3143         }
3144         for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
3145                 struct btrfs_qgroup *src;
3146                 struct btrfs_qgroup *dst;
3147
3148                 if (!i_qgroups[0] || !i_qgroups[1])
3149                         continue;
3150
3151                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
3152                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
3153
3154                 if (!src || !dst) {
3155                         ret = -EINVAL;
3156                         goto unlock;
3157                 }
3158
3159                 dst->excl = src->excl + level_size;
3160                 dst->excl_cmpr = src->excl_cmpr + level_size;
3161                 need_rescan = true;
3162         }
3163
3164 unlock:
3165         spin_unlock(&fs_info->qgroup_lock);
3166         if (!ret)
3167                 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
3168 out:
3169         if (!committing)
3170                 mutex_unlock(&fs_info->qgroup_ioctl_lock);
3171         if (need_rescan)
3172                 qgroup_mark_inconsistent(fs_info);
3173         if (qlist_prealloc) {
3174                 for (int i = 0; i < inherit->num_qgroups; i++)
3175                         kfree(qlist_prealloc[i]);
3176                 kfree(qlist_prealloc);
3177         }
3178         kfree(prealloc);
3179         return ret;
3180 }
3181
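/*
 * Check whether the qgroup can accommodate @num_bytes more bytes without
 * exceeding its referenced or exclusive limits, taking the currently
 * reserved space into account.
 *
 * Returns true if the reservation fits, false if a limit would be exceeded.
 */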
3182 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3183 {
3184         if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3185             qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3186                 return false;
3187
3188         if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3189             qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3190                 return false;
3191
3192         return true;
3193 }
3194
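/*
 * Reserve @num_bytes of @type for the qgroup of @root and all of its
 * parent qgroups.
 *
 * If @enforce is true and any qgroup in the hierarchy would exceed its
 * limits, nothing is reserved and -EDQUOT is returned.
 */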
3195 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3196                           enum btrfs_qgroup_rsv_type type)
3197 {
3198         struct btrfs_qgroup *qgroup;
3199         struct btrfs_fs_info *fs_info = root->fs_info;
3200         u64 ref_root = root->root_key.objectid;
3201         int ret = 0;
3202         LIST_HEAD(qgroup_list);
3203
3204         if (!is_fstree(ref_root))
3205                 return 0;
3206
3207         if (num_bytes == 0)
3208                 return 0;
3209
3210         if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3211             capable(CAP_SYS_RESOURCE))
3212                 enforce = false;
3213
3214         spin_lock(&fs_info->qgroup_lock);
3215         if (!fs_info->quota_root)
3216                 goto out;
3217
3218         qgroup = find_qgroup_rb(fs_info, ref_root);
3219         if (!qgroup)
3220                 goto out;
3221
3222         qgroup_iterator_add(&qgroup_list, qgroup);
3223         list_for_each_entry(qgroup, &qgroup_list, iterator) {
3224                 struct btrfs_qgroup_list *glist;
3225
3226                 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
3227                         ret = -EDQUOT;
3228                         goto out;
3229                 }
3230
3231                 list_for_each_entry(glist, &qgroup->groups, next_group)
3232                         qgroup_iterator_add(&qgroup_list, glist->group);
3233         }
3234
3235         ret = 0;
3236         /*
3237          * no limits exceeded, now record the reservation into all qgroups
3238          */
3239         list_for_each_entry(qgroup, &qgroup_list, iterator)
3240                 qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
3241
3242 out:
3243         qgroup_iterator_clean(&qgroup_list);
3244         spin_unlock(&fs_info->qgroup_lock);
3245         return ret;
3246 }
3247
3248 /*
3249  * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
3250  * qgroup).
3251  *
3252  * Will handle all higher level qgroups too.
3253  *
3254  * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3255  * This special case is only used for META_PERTRANS type.
3256  */
3257 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3258                                u64 ref_root, u64 num_bytes,
3259                                enum btrfs_qgroup_rsv_type type)
3260 {
3261         struct btrfs_qgroup *qgroup;
3262         LIST_HEAD(qgroup_list);
3263
3264         if (!is_fstree(ref_root))
3265                 return;
3266
3267         if (num_bytes == 0)
3268                 return;
3269
3270         if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3271                 WARN(1, "%s: Invalid type to free", __func__);
3272                 return;
3273         }
3274         spin_lock(&fs_info->qgroup_lock);
3275
3276         if (!fs_info->quota_root)
3277                 goto out;
3278
3279         qgroup = find_qgroup_rb(fs_info, ref_root);
3280         if (!qgroup)
3281                 goto out;
3282
3283         if (num_bytes == (u64)-1)
3284                 /*
3285                  * We're freeing all pertrans rsv, get reserved value from
3286                  * level 0 qgroup as real num_bytes to free.
3287                  */
3288                 num_bytes = qgroup->rsv.values[type];
3289
3290         qgroup_iterator_add(&qgroup_list, qgroup);
3291         list_for_each_entry(qgroup, &qgroup_list, iterator) {
3292                 struct btrfs_qgroup_list *glist;
3293
3294                 qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
3295                 list_for_each_entry(glist, &qgroup->groups, next_group) {
3296                         qgroup_iterator_add(&qgroup_list, glist->group);
3297                 }
3298         }
3299 out:
3300         qgroup_iterator_clean(&qgroup_list);
3301         spin_unlock(&fs_info->qgroup_lock);
3302 }
3303
3304 /*
3305  * Check if the leaf is the last leaf, which means all node pointers
3306  * are at their last position.
3307  */
3308 static bool is_last_leaf(struct btrfs_path *path)
3309 {
3310         int i;
3311
3312         for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3313                 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3314                         return false;
3315         }
3316         return true;
3317 }
3318
3319 /*
3320  * Returns < 0 on error, 0 when more leaves are to be scanned.
3321  * Returns 1 when done.
3322  */
3323 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3324                               struct btrfs_path *path)
3325 {
3326         struct btrfs_fs_info *fs_info = trans->fs_info;
3327         struct btrfs_root *extent_root;
3328         struct btrfs_key found;
3329         struct extent_buffer *scratch_leaf = NULL;
3330         u64 num_bytes;
3331         bool done;
3332         int slot;
3333         int ret;
3334
3335         mutex_lock(&fs_info->qgroup_rescan_lock);
3336         extent_root = btrfs_extent_root(fs_info,
3337                                 fs_info->qgroup_rescan_progress.objectid);
3338         ret = btrfs_search_slot_for_read(extent_root,
3339                                          &fs_info->qgroup_rescan_progress,
3340                                          path, 1, 0);
3341
3342         btrfs_debug(fs_info,
3343                 "current progress key (%llu %u %llu), search_slot ret %d",
3344                 fs_info->qgroup_rescan_progress.objectid,
3345                 fs_info->qgroup_rescan_progress.type,
3346                 fs_info->qgroup_rescan_progress.offset, ret);
3347
3348         if (ret) {
3349                 /*
3350                  * The rescan is about to end, we will not be scanning any
3351                  * further blocks. We cannot unset the RESCAN flag here, because
3352                  * we want to commit the transaction if everything went well.
3353                  * To make the live accounting work in this phase, we set our
3354                  * scan progress pointer such that every real extent objectid
3355                  * will be smaller.
3356                  */
3357                 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3358                 btrfs_release_path(path);
3359                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3360                 return ret;
3361         }
3362         done = is_last_leaf(path);
3363
3364         btrfs_item_key_to_cpu(path->nodes[0], &found,
3365                               btrfs_header_nritems(path->nodes[0]) - 1);
3366         fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3367
3368         scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3369         if (!scratch_leaf) {
3370                 ret = -ENOMEM;
3371                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3372                 goto out;
3373         }
3374         slot = path->slots[0];
3375         btrfs_release_path(path);
3376         mutex_unlock(&fs_info->qgroup_rescan_lock);
3377
3378         for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3379                 struct btrfs_backref_walk_ctx ctx = { 0 };
3380
3381                 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3382                 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3383                     found.type != BTRFS_METADATA_ITEM_KEY)
3384                         continue;
3385                 if (found.type == BTRFS_METADATA_ITEM_KEY)
3386                         num_bytes = fs_info->nodesize;
3387                 else
3388                         num_bytes = found.offset;
3389
3390                 ctx.bytenr = found.objectid;
3391                 ctx.fs_info = fs_info;
3392
3393                 ret = btrfs_find_all_roots(&ctx, false);
3394                 if (ret < 0)
3395                         goto out;
3396                 /* For rescan, just pass old_roots as NULL */
3397                 ret = btrfs_qgroup_account_extent(trans, found.objectid,
3398                                                   num_bytes, NULL, ctx.roots);
3399                 if (ret < 0)
3400                         goto out;
3401         }
3402 out:
3403         if (scratch_leaf)
3404                 free_extent_buffer(scratch_leaf);
3405
3406         if (done && !ret) {
3407                 ret = 1;
3408                 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3409         }
3410         return ret;
3411 }
3412
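/*
 * Check whether the rescan worker should stop: the filesystem is closing or
 * being remounted, quota got disabled, or the rescan was cancelled.
 */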
3413 static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3414 {
3415         return btrfs_fs_closing(fs_info) ||
3416                 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3417                 !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3418                 (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN);
3419 }
3420
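/*
 * Rescan worker, walking the extent tree leaf by leaf in its own
 * transactions and re-accounting every extent item found.
 *
 * On a clean finish the INCONSISTENT flag is cleared; if the scan failed or
 * was stopped early, the flag is set instead.  The status item is updated in
 * a final transaction if any leaves were scanned.
 */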
3421 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3422 {
3423         struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3424                                                      qgroup_rescan_work);
3425         struct btrfs_path *path;
3426         struct btrfs_trans_handle *trans = NULL;
3427         int err = -ENOMEM;
3428         int ret = 0;
3429         bool stopped = false;
3430         bool did_leaf_rescans = false;
3431
3432         path = btrfs_alloc_path();
3433         if (!path)
3434                 goto out;
3435         /*
3436          * Rescan should only search the commit root, and any later difference
3437          * should be recorded by qgroup.
3438          */
3439         path->search_commit_root = 1;
3440         path->skip_locking = 1;
3441
3442         err = 0;
3443         while (!err && !(stopped = rescan_should_stop(fs_info))) {
3444                 trans = btrfs_start_transaction(fs_info->fs_root, 0);
3445                 if (IS_ERR(trans)) {
3446                         err = PTR_ERR(trans);
3447                         break;
3448                 }
3449
3450                 err = qgroup_rescan_leaf(trans, path);
3451                 did_leaf_rescans = true;
3452
3453                 if (err > 0)
3454                         btrfs_commit_transaction(trans);
3455                 else
3456                         btrfs_end_transaction(trans);
3457         }
3458
3459 out:
3460         btrfs_free_path(path);
3461
3462         mutex_lock(&fs_info->qgroup_rescan_lock);
3463         if (err > 0 &&
3464             fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3465                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3466         } else if (err < 0 || stopped) {
3467                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3468         }
3469         mutex_unlock(&fs_info->qgroup_rescan_lock);
3470
3471         /*
3472          * Only update status, since the previous part has already updated the
3473          * qgroup info, and only if we did any actual work. This also prevents
3474          * race with a concurrent quota disable, which has already set
3475          * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
3476          * btrfs_quota_disable().
3477          */
3478         if (did_leaf_rescans) {
3479                 trans = btrfs_start_transaction(fs_info->quota_root, 1);
3480                 if (IS_ERR(trans)) {
3481                         err = PTR_ERR(trans);
3482                         trans = NULL;
3483                         btrfs_err(fs_info,
3484                                   "fail to start transaction for status update: %d",
3485                                   err);
3486                 }
3487         } else {
3488                 trans = NULL;
3489         }
3490
3491         mutex_lock(&fs_info->qgroup_rescan_lock);
3492         if (!stopped ||
3493             fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
3494                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3495         if (trans) {
3496                 ret = update_qgroup_status_item(trans);
3497                 if (ret < 0) {
3498                         err = ret;
3499                         btrfs_err(fs_info, "fail to update qgroup status: %d",
3500                                   err);
3501                 }
3502         }
3503         fs_info->qgroup_rescan_running = false;
3504         fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
3505         complete_all(&fs_info->qgroup_rescan_completion);
3506         mutex_unlock(&fs_info->qgroup_rescan_lock);
3507
3508         if (!trans)
3509                 return;
3510
3511         btrfs_end_transaction(trans);
3512
3513         if (stopped) {
3514                 btrfs_info(fs_info, "qgroup scan paused");
3515         } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
3516                 btrfs_info(fs_info, "qgroup scan cancelled");
3517         } else if (err >= 0) {
3518                 btrfs_info(fs_info, "qgroup scan completed%s",
3519                         err > 0 ? " (inconsistency flag cleared)" : "");
3520         } else {
3521                 btrfs_err(fs_info, "qgroup scan failed with %d", err);
3522         }
3523 }
3524
3525 /*
3526  * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3527  * memory required for the rescan context.
3528  */
3529 static int
3530 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3531                    int init_flags)
3532 {
3533         int ret = 0;
3534
3535         if (!init_flags) {
3536                 /* we're resuming qgroup rescan at mount time */
3537                 if (!(fs_info->qgroup_flags &
3538                       BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3539                         btrfs_warn(fs_info,
3540                         "qgroup rescan init failed, qgroup rescan is not queued");
3541                         ret = -EINVAL;
3542                 } else if (!(fs_info->qgroup_flags &
3543                              BTRFS_QGROUP_STATUS_FLAG_ON)) {
3544                         btrfs_warn(fs_info,
3545                         "qgroup rescan init failed, qgroup is not enabled");
3546                         ret = -EINVAL;
3547                 }
3548
3549                 if (ret)
3550                         return ret;
3551         }
3552
3553         mutex_lock(&fs_info->qgroup_rescan_lock);
3554
3555         if (init_flags) {
3556                 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3557                         btrfs_warn(fs_info,
3558                                    "qgroup rescan is already in progress");
3559                         ret = -EINPROGRESS;
3560                 } else if (!(fs_info->qgroup_flags &
3561                              BTRFS_QGROUP_STATUS_FLAG_ON)) {
3562                         btrfs_warn(fs_info,
3563                         "qgroup rescan init failed, qgroup is not enabled");
3564                         ret = -EINVAL;
3565                 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3566                         /* Quota disable is in progress */
3567                         ret = -EBUSY;
3568                 }
3569
3570                 if (ret) {
3571                         mutex_unlock(&fs_info->qgroup_rescan_lock);
3572                         return ret;
3573                 }
3574                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3575         }
3576
3577         memset(&fs_info->qgroup_rescan_progress, 0,
3578                 sizeof(fs_info->qgroup_rescan_progress));
3579         fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
3580                                    BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
3581         fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3582         init_completion(&fs_info->qgroup_rescan_completion);
3583         mutex_unlock(&fs_info->qgroup_rescan_lock);
3584
3585         btrfs_init_work(&fs_info->qgroup_rescan_work,
3586                         btrfs_qgroup_rescan_worker, NULL, NULL);
3587         return 0;
3588 }
3589
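/*
 * Clear the tracked numbers (rfer/excl and their compressed counterparts) of
 * every qgroup and mark them dirty, so a rescan starts from a clean state.
 */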
3590 static void
3591 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3592 {
3593         struct rb_node *n;
3594         struct btrfs_qgroup *qgroup;
3595
3596         spin_lock(&fs_info->qgroup_lock);
3597         /* clear all current qgroup tracking information */
3598         for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3599                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
3600                 qgroup->rfer = 0;
3601                 qgroup->rfer_cmpr = 0;
3602                 qgroup->excl = 0;
3603                 qgroup->excl_cmpr = 0;
3604                 qgroup_dirty(fs_info, qgroup);
3605         }
3606         spin_unlock(&fs_info->qgroup_lock);
3607 }
3608
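/*
 * Start a new qgroup rescan.
 *
 * Initialize the rescan state, commit the running transaction so all delayed
 * refs are accounted before tracking is reset, zero the per-qgroup counters
 * and queue the rescan worker.
 */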
3609 int
3610 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3611 {
3612         int ret = 0;
3613         struct btrfs_trans_handle *trans;
3614
3615         ret = qgroup_rescan_init(fs_info, 0, 1);
3616         if (ret)
3617                 return ret;
3618
3619         /*
3620          * We have set the rescan_progress to 0, which means no more
3621          * delayed refs will be accounted by btrfs_qgroup_account_ref.
3622          * However, btrfs_qgroup_account_ref may already be right after its
3623          * call to btrfs_find_all_roots, in which case it would still do the
3624          * accounting.
3625          * To solve this, we're committing the transaction, which will
3626          * ensure we run all delayed refs and only after that, we are
3627          * going to clear all tracking information for a clean start.
3628          */
3629
3630         trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
3631         if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
3632                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3633                 return PTR_ERR(trans);
3634         } else if (trans != ERR_PTR(-ENOENT)) {
3635                 ret = btrfs_commit_transaction(trans);
3636                 if (ret) {
3637                         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3638                         return ret;
3639                 }
3640         }
3641
3642         qgroup_rescan_zero_tracking(fs_info);
3643
3644         mutex_lock(&fs_info->qgroup_rescan_lock);
3645         fs_info->qgroup_rescan_running = true;
3646         btrfs_queue_work(fs_info->qgroup_rescan_workers,
3647                          &fs_info->qgroup_rescan_work);
3648         mutex_unlock(&fs_info->qgroup_rescan_lock);
3649
3650         return 0;
3651 }
3652
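/*
 * Wait for a running rescan worker to finish.
 *
 * If @interruptible is set, the wait can be interrupted by a signal, in
 * which case -ERESTARTSYS is returned.
 */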
3653 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3654                                      bool interruptible)
3655 {
3656         int running;
3657         int ret = 0;
3658
3659         mutex_lock(&fs_info->qgroup_rescan_lock);
3660         running = fs_info->qgroup_rescan_running;
3661         mutex_unlock(&fs_info->qgroup_rescan_lock);
3662
3663         if (!running)
3664                 return 0;
3665
3666         if (interruptible)
3667                 ret = wait_for_completion_interruptible(
3668                                         &fs_info->qgroup_rescan_completion);
3669         else
3670                 wait_for_completion(&fs_info->qgroup_rescan_completion);
3671
3672         return ret;
3673 }
3674
3675 /*
3676  * This is only called from open_ctree() where we're still single threaded, thus
3677  * locking is omitted here.
3678  */
3679 void
3680 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3681 {
3682         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3683                 mutex_lock(&fs_info->qgroup_rescan_lock);
3684                 fs_info->qgroup_rescan_running = true;
3685                 btrfs_queue_work(fs_info->qgroup_rescan_workers,
3686                                  &fs_info->qgroup_rescan_work);
3687                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3688         }
3689 }
3690
3691 #define rbtree_iterate_from_safe(node, next, start)                             \
3692        for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
3693
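/*
 * Clear the EXTENT_QGROUP_RESERVED bit for the parts of @reserved that fall
 * inside [@start, @start + @len) and drop them from the changeset, adjusting
 * reserved->bytes_changed accordingly.
 *
 * Used to undo a partially successful data reservation.
 */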
3694 static int qgroup_unreserve_range(struct btrfs_inode *inode,
3695                                   struct extent_changeset *reserved, u64 start,
3696                                   u64 len)
3697 {
3698         struct rb_node *node;
3699         struct rb_node *next;
3700         struct ulist_node *entry;
3701         int ret = 0;
3702
3703         node = reserved->range_changed.root.rb_node;
3704         if (!node)
3705                 return 0;
3706         while (node) {
3707                 entry = rb_entry(node, struct ulist_node, rb_node);
3708                 if (entry->val < start)
3709                         node = node->rb_right;
3710                 else
3711                         node = node->rb_left;
3712         }
3713
3714         if (entry->val > start && rb_prev(&entry->rb_node))
3715                 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3716                                  rb_node);
3717
3718         rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3719                 u64 entry_start;
3720                 u64 entry_end;
3721                 u64 entry_len;
3722                 int clear_ret;
3723
3724                 entry = rb_entry(node, struct ulist_node, rb_node);
3725                 entry_start = entry->val;
3726                 entry_end = entry->aux;
3727                 entry_len = entry_end - entry_start + 1;
3728
3729                 if (entry_start >= start + len)
3730                         break;
3731                 if (entry_start + entry_len <= start)
3732                         continue;
3733                 /*
3734                  * Now the entry is in [start, start + len), revert the
3735                  * EXTENT_QGROUP_RESERVED bit.
3736                  */
3737                 clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3738                                               entry_end, EXTENT_QGROUP_RESERVED);
3739                 if (!ret && clear_ret < 0)
3740                         ret = clear_ret;
3741
3742                 ulist_del(&reserved->range_changed, entry->val, entry->aux);
3743                 if (likely(reserved->bytes_changed >= entry_len)) {
3744                         reserved->bytes_changed -= entry_len;
3745                 } else {
3746                         WARN_ON(1);
3747                         reserved->bytes_changed = 0;
3748                 }
3749         }
3750
3751         return ret;
3752 }
3753
3754 /*
3755  * Try to free some space for qgroup.
3756  *
3757  * For qgroup, there are only 3 ways to free qgroup space:
3758  * - Flush nodatacow write
3759  *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3760  *   In theory, we should only flush nodatacow inodes, but it's not yet
3761  *   possible, so we need to flush the whole root.
3762  *
3763  * - Wait for ordered extents
3764  *   When ordered extents are finished, their reserved metadata is finally
3765  *   converted to per_trans status, which can be freed by a later transaction
3766  *   commit.
3767  *
3768  * - Commit transaction
3769  *   This would free the meta_per_trans space.
3770  *   In theory this shouldn't provide much space, but it helps when any
3771  *   extra qgroup space is needed.
3772  */
3773 static int try_flush_qgroup(struct btrfs_root *root)
3774 {
3775         struct btrfs_trans_handle *trans;
3776         int ret;
3777
3778         /* Can't hold an open transaction or we run the risk of deadlocking. */
3779         ASSERT(current->journal_info == NULL);
3780         if (WARN_ON(current->journal_info))
3781                 return 0;
3782
3783         /*
3784          * We don't want to run flush again and again, so if there is a running
3785          * We don't want to run flush again and again, so if there is a running
3786          * one, we won't start a new flush, but wait for it to finish and exit.
3787         if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3788                 wait_event(root->qgroup_flush_wait,
3789                         !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3790                 return 0;
3791         }
3792
3793         ret = btrfs_start_delalloc_snapshot(root, true);
3794         if (ret < 0)
3795                 goto out;
3796         btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3797
3798         trans = btrfs_attach_transaction_barrier(root);
3799         if (IS_ERR(trans)) {
3800                 ret = PTR_ERR(trans);
3801                 if (ret == -ENOENT)
3802                         ret = 0;
3803                 goto out;
3804         }
3805
3806         ret = btrfs_commit_transaction(trans);
3807 out:
3808         clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3809         wake_up(&root->qgroup_flush_wait);
3810         return ret;
3811 }
3812
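/*
 * Reserve data space for the range [@start, @start + @len) of @inode.
 *
 * Newly dirtied ranges are recorded in *@reserved_ret via the
 * EXTENT_QGROUP_RESERVED bit, and only the bytes that were not already
 * reserved are charged to the qgroups.  On failure the range is unreserved
 * again.
 */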
3813 static int qgroup_reserve_data(struct btrfs_inode *inode,
3814                         struct extent_changeset **reserved_ret, u64 start,
3815                         u64 len)
3816 {
3817         struct btrfs_root *root = inode->root;
3818         struct extent_changeset *reserved;
3819         bool new_reserved = false;
3820         u64 orig_reserved;
3821         u64 to_reserve;
3822         int ret;
3823
3824         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3825             !is_fstree(root->root_key.objectid) || len == 0)
3826                 return 0;
3827
3828         /* @reserved parameter is mandatory for qgroup */
3829         if (WARN_ON(!reserved_ret))
3830                 return -EINVAL;
3831         if (!*reserved_ret) {
3832                 new_reserved = true;
3833                 *reserved_ret = extent_changeset_alloc();
3834                 if (!*reserved_ret)
3835                         return -ENOMEM;
3836         }
3837         reserved = *reserved_ret;
3838         /* Record already reserved space */
3839         orig_reserved = reserved->bytes_changed;
3840         ret = set_record_extent_bits(&inode->io_tree, start,
3841                         start + len -1, EXTENT_QGROUP_RESERVED, reserved);
3842
3843         /* Newly reserved space */
3844         to_reserve = reserved->bytes_changed - orig_reserved;
3845         trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3846                                         to_reserve, QGROUP_RESERVE);
3847         if (ret < 0)
3848                 goto out;
3849         ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3850         if (ret < 0)
3851                 goto cleanup;
3852
3853         return ret;
3854
3855 cleanup:
3856         qgroup_unreserve_range(inode, reserved, start, len);
3857 out:
3858         if (new_reserved) {
3859                 extent_changeset_free(reserved);
3860                 *reserved_ret = NULL;
3861         }
3862         return ret;
3863 }
3864
3865 /*
3866  * Reserve qgroup space for range [start, start + len).
3867  *
3868  * This function will either reserve space from related qgroups or do nothing
3869  * if the range is already reserved.
3870  *
3871  * Return 0 for successful reservation.
3872  * Return <0 for error (including -EDQUOT).
3873  *
3874  * NOTE: This function may sleep for memory allocation, dirty page flushing and
3875  *       transaction commit, so the caller should not hold any dirty page locked.
3876  */
3877 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3878                         struct extent_changeset **reserved_ret, u64 start,
3879                         u64 len)
3880 {
3881         int ret;
3882
3883         ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3884         if (ret <= 0 && ret != -EDQUOT)
3885                 return ret;
3886
3887         ret = try_flush_qgroup(inode->root);
3888         if (ret < 0)
3889                 return ret;
3890         return qgroup_reserve_data(inode, reserved_ret, start, len);
3891 }
3892
3893 /* Free ranges specified by @reserved, normally in error path */
3894 static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3895                         struct extent_changeset *reserved, u64 start, u64 len)
3896 {
3897         struct btrfs_root *root = inode->root;
3898         struct ulist_node *unode;
3899         struct ulist_iterator uiter;
3900         struct extent_changeset changeset;
3901         int freed = 0;
3902         int ret;
3903
3904         extent_changeset_init(&changeset);
3905         len = round_up(start + len, root->fs_info->sectorsize);
3906         start = round_down(start, root->fs_info->sectorsize);
3907
3908         ULIST_ITER_INIT(&uiter);
3909         while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3910                 u64 range_start = unode->val;
3911                 /* unode->aux is the inclusive end */
3912                 u64 range_len = unode->aux - range_start + 1;
3913                 u64 free_start;
3914                 u64 free_len;
3915
3916                 extent_changeset_release(&changeset);
3917
3918                 /* Only free range in range [start, start + len) */
3919                 if (range_start >= start + len ||
3920                     range_start + range_len <= start)
3921                         continue;
3922                 free_start = max(range_start, start);
3923                 free_len = min(start + len, range_start + range_len) -
3924                            free_start;
3925                 /*
3926                  * TODO: Also modify reserved->ranges_reserved to reflect
3927                  * the modification.
3928                  *
3929                  * However, as long as we free the qgroup reserved space
3930                  * according to EXTENT_QGROUP_RESERVED, we won't double
3931                  * free, so there is no need to rush.
3932                  */
3933                 ret = clear_record_extent_bits(&inode->io_tree, free_start,
3934                                 free_start + free_len - 1,
3935                                 EXTENT_QGROUP_RESERVED, &changeset);
3936                 if (ret < 0)
3937                         goto out;
3938                 freed += changeset.bytes_changed;
3939         }
3940         btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3941                                   BTRFS_QGROUP_RSV_DATA);
3942         ret = freed;
3943 out:
3944         extent_changeset_release(&changeset);
3945         return ret;
3946 }
3947
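/*
 * Common helper to release or free a reserved data range.
 *
 * With @free set and @reserved given, only the ranges recorded in @reserved
 * are freed.  Otherwise the EXTENT_QGROUP_RESERVED bit is cleared for
 * [@start, @start + @len) and, in the free case, the qgroup reservation is
 * dropped as well.  Returns the number of bytes affected or a negative error.
 */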
3948 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3949                         struct extent_changeset *reserved, u64 start, u64 len,
3950                         int free)
3951 {
3952         struct extent_changeset changeset;
3953         int trace_op = QGROUP_RELEASE;
3954         int ret;
3955
3956         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3957                 return 0;
3958
3959         /* In release case, we shouldn't have @reserved */
3960         WARN_ON(!free && reserved);
3961         if (free && reserved)
3962                 return qgroup_free_reserved_data(inode, reserved, start, len);
3963         extent_changeset_init(&changeset);
3964         ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
3965                                        EXTENT_QGROUP_RESERVED, &changeset);
3966         if (ret < 0)
3967                 goto out;
3968
3969         if (free)
3970                 trace_op = QGROUP_FREE;
3971         trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3972                                         changeset.bytes_changed, trace_op);
3973         if (free)
3974                 btrfs_qgroup_free_refroot(inode->root->fs_info,
3975                                 inode->root->root_key.objectid,
3976                                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3977         ret = changeset.bytes_changed;
3978 out:
3979         extent_changeset_release(&changeset);
3980         return ret;
3981 }
3982
3983 /*
3984  * Free a reserved space range from io_tree and related qgroups
3985  *
3986  * Should be called when a range of pages gets invalidated before reaching
3987  * disk, or for the error cleanup case.
3988  * If @reserved is given, only the reserved range in [@start, @start + @len)
3989  * will be freed.
3990  *
3991  * For data written to disk, use btrfs_qgroup_release_data().
3992  *
3993  * NOTE: This function may sleep for memory allocation.
3994  */
3995 int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3996                         struct extent_changeset *reserved, u64 start, u64 len)
3997 {
3998         return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3999 }
4000
4001 /*
4002  * Release a reserved space range from io_tree only.
4003  *
4004  * Should be called when a range of pages gets written to disk and the
4005  * corresponding FILE_EXTENT is inserted into the root.
4006  *
4007  * Since the new qgroup accounting framework will only update qgroup numbers at
4008  * commit_transaction() time, its reserved space shouldn't be freed from
4009  * related qgroups.
4010  *
4011  * But we should release the range from the io_tree, to allow further writes
4012  * to be COWed.
4013  *
4014  * NOTE: This function may sleep for memory allocation.
4015  */
4016 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
4017 {
4018         return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
4019 }
4020
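/*
 * Track metadata reservations per root, so a later free cannot release more
 * than was actually reserved (e.g. after quota was disabled and re-enabled).
 * Only the META_PREALLOC and META_PERTRANS types are tracked.
 */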
4021 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
4022                               enum btrfs_qgroup_rsv_type type)
4023 {
4024         if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
4025             type != BTRFS_QGROUP_RSV_META_PERTRANS)
4026                 return;
4027         if (num_bytes == 0)
4028                 return;
4029
4030         spin_lock(&root->qgroup_meta_rsv_lock);
4031         if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
4032                 root->qgroup_meta_rsv_prealloc += num_bytes;
4033         else
4034                 root->qgroup_meta_rsv_pertrans += num_bytes;
4035         spin_unlock(&root->qgroup_meta_rsv_lock);
4036 }
4037
4038 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
4039                              enum btrfs_qgroup_rsv_type type)
4040 {
4041         if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
4042             type != BTRFS_QGROUP_RSV_META_PERTRANS)
4043                 return 0;
4044         if (num_bytes == 0)
4045                 return 0;
4046
4047         spin_lock(&root->qgroup_meta_rsv_lock);
4048         if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
4049                 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
4050                                   num_bytes);
4051                 root->qgroup_meta_rsv_prealloc -= num_bytes;
4052         } else {
4053                 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
4054                                   num_bytes);
4055                 root->qgroup_meta_rsv_pertrans -= num_bytes;
4056         }
4057         spin_unlock(&root->qgroup_meta_rsv_lock);
4058         return num_bytes;
4059 }
4060
4061 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
4062                               enum btrfs_qgroup_rsv_type type, bool enforce)
4063 {
4064         struct btrfs_fs_info *fs_info = root->fs_info;
4065         int ret;
4066
4067         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4068             !is_fstree(root->root_key.objectid) || num_bytes == 0)
4069                 return 0;
4070
4071         BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
4072         trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
4073         ret = qgroup_reserve(root, num_bytes, enforce, type);
4074         if (ret < 0)
4075                 return ret;
4076         /*
4077          * Record what we have reserved into the root.
4078          *
4079          * This avoids a quota disabled->enabled underflow: in that case we
4080          * may try to free space we haven't reserved (since quota was
4081          * disabled), so record what we reserved into the root and ensure a
4082          * later release won't underflow this number.
4083          */
4084         add_root_meta_rsv(root, num_bytes, type);
4085         return ret;
4086 }
4087
4088 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
4089                                 enum btrfs_qgroup_rsv_type type, bool enforce,
4090                                 bool noflush)
4091 {
4092         int ret;
4093
4094         ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4095         if ((ret <= 0 && ret != -EDQUOT) || noflush)
4096                 return ret;
4097
4098         ret = try_flush_qgroup(root);
4099         if (ret < 0)
4100                 return ret;
4101         return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4102 }
4103
4104 /*
4105  * Per-transaction meta reservation should be all freed at transaction commit
4106  * time
4107  */
4108 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
4109 {
4110         struct btrfs_fs_info *fs_info = root->fs_info;
4111
4112         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4113             !is_fstree(root->root_key.objectid))
4114                 return;
4115
4116         /* TODO: Update trace point to handle such free */
4117         trace_qgroup_meta_free_all_pertrans(root);
4118         /* Special value -1 means to free all reserved space */
4119         btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
4120                                   BTRFS_QGROUP_RSV_META_PERTRANS);
4121 }
4122
4123 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
4124                               enum btrfs_qgroup_rsv_type type)
4125 {
4126         struct btrfs_fs_info *fs_info = root->fs_info;
4127
4128         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4129             !is_fstree(root->root_key.objectid))
4130                 return;
4131
4132         /*
4133          * Reservation for META_PREALLOC can happen before quota is enabled,
4134          * which can lead to underflow.
4135          * Here we ensure we only free what we have really reserved.
4136          */
4137         num_bytes = sub_root_meta_rsv(root, num_bytes, type);
4138         BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
4139         trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
4140         btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
4141                                   num_bytes, type);
4142 }
4143
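/*
 * Move @num_bytes of reserved space from META_PREALLOC to META_PERTRANS for
 * the qgroup of @ref_root and all of its parent qgroups.
 */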
4144 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
4145                                 int num_bytes)
4146 {
4147         struct btrfs_qgroup *qgroup;
4148         LIST_HEAD(qgroup_list);
4149
4150         if (num_bytes == 0)
4151                 return;
4152         if (!fs_info->quota_root)
4153                 return;
4154
4155         spin_lock(&fs_info->qgroup_lock);
4156         qgroup = find_qgroup_rb(fs_info, ref_root);
4157         if (!qgroup)
4158                 goto out;
4159
4160         qgroup_iterator_add(&qgroup_list, qgroup);
4161         list_for_each_entry(qgroup, &qgroup_list, iterator) {
4162                 struct btrfs_qgroup_list *glist;
4163
4164                 qgroup_rsv_release(fs_info, qgroup, num_bytes,
4165                                 BTRFS_QGROUP_RSV_META_PREALLOC);
4166                 qgroup_rsv_add(fs_info, qgroup, num_bytes,
4167                                 BTRFS_QGROUP_RSV_META_PERTRANS);
4168
4169                 list_for_each_entry(glist, &qgroup->groups, next_group)
4170                         qgroup_iterator_add(&qgroup_list, glist->group);
4171         }
4172 out:
4173         qgroup_iterator_clean(&qgroup_list);
4174         spin_unlock(&fs_info->qgroup_lock);
4175 }
4176
4177 /*
4178  * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
4179  *
4180  * This is called when preallocated meta reservation needs to be used.
4181  * Normally after btrfs_join_transaction() call.
4182  */
4183 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
4184 {
4185         struct btrfs_fs_info *fs_info = root->fs_info;
4186
4187         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4188             !is_fstree(root->root_key.objectid))
4189                 return;
4190         /* Same as btrfs_qgroup_free_meta_prealloc() */
4191         num_bytes = sub_root_meta_rsv(root, num_bytes,
4192                                       BTRFS_QGROUP_RSV_META_PREALLOC);
4193         trace_qgroup_meta_convert(root, num_bytes);
4194         qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
4195 }
4196
4197 /*
4198  * Check for leaked qgroup reserved space, normally at inode destruction
4199  * time.
4200  */
4201 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4202 {
4203         struct extent_changeset changeset;
4204         struct ulist_node *unode;
4205         struct ulist_iterator iter;
4206         int ret;
4207
4208         extent_changeset_init(&changeset);
4209         ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4210                         EXTENT_QGROUP_RESERVED, &changeset);
4211
4212         WARN_ON(ret < 0);
4213         if (WARN_ON(changeset.bytes_changed)) {
4214                 ULIST_ITER_INIT(&iter);
4215                 while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4216                         btrfs_warn(inode->root->fs_info,
4217                 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4218                                 btrfs_ino(inode), unode->val, unode->aux);
4219                 }
4220                 btrfs_qgroup_free_refroot(inode->root->fs_info,
4221                                 inode->root->root_key.objectid,
4222                                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4223
4224         }
4225         extent_changeset_release(&changeset);
4226 }
4227
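/*
 * Initialize the per-root structure that records subtree blocks swapped
 * during relocation, allowing qgroup to skip full subtree scans.
 */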
4228 void btrfs_qgroup_init_swapped_blocks(
4229         struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4230 {
4231         int i;
4232
4233         spin_lock_init(&swapped_blocks->lock);
4234         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4235                 swapped_blocks->blocks[i] = RB_ROOT;
4236         swapped_blocks->swapped = false;
4237 }
4238
4239 /*
4240  * Delete all swapped block records of @root.
4241  * Every record here means we skipped a full subtree scan for qgroup.
4242  *
4243  * Gets called when committing one transaction.
4244  */
4245 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4246 {
4247         struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4248         int i;
4249
4250         swapped_blocks = &root->swapped_blocks;
4251
4252         spin_lock(&swapped_blocks->lock);
4253         if (!swapped_blocks->swapped)
4254                 goto out;
4255         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4256                 struct rb_root *cur_root = &swapped_blocks->blocks[i];
4257                 struct btrfs_qgroup_swapped_block *entry;
4258                 struct btrfs_qgroup_swapped_block *next;
4259
4260                 rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4261                                                      node)
4262                         kfree(entry);
4263                 swapped_blocks->blocks[i] = RB_ROOT;
4264         }
4265         swapped_blocks->swapped = false;
4266 out:
4267         spin_unlock(&swapped_blocks->lock);
4268 }
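
/*
 * Freeing every node of an rbtree with rbtree_postorder_for_each_entry_safe()
 * as done above avoids per-node rebalancing: the walk visits a node only
 * after both of its children, so entries can be kfree()d during the walk as
 * long as the root is reset afterwards.  Minimal generic pattern ('struct foo'
 * and free_all() are made-up names for illustration):
 *
 *	struct foo { struct rb_node node; };
 *
 *	static void free_all(struct rb_root *root)
 *	{
 *		struct foo *entry;
 *		struct foo *next;
 *
 *		rbtree_postorder_for_each_entry_safe(entry, next, root, node)
 *			kfree(entry);
 *		*root = RB_ROOT;
 *	}
 */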
4269
4270 /*
4271  * Add a record of the swapped subtree roots into @subvol_root.
4272  *
4273  * @subvol_root:        tree root of the subvolume tree that gets swapped
4274  * @bg:                 block group under balance
4275  * @subvol_parent/slot: pointer to the subtree root in the subvolume tree
4276  * @reloc_parent/slot:  pointer to the subtree root in the reloc tree
4277  *                      BOTH POINTERS ARE BEFORE TREE SWAP
4278  * @last_snapshot:      last snapshot generation of the subvolume tree
4279  */
4280 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4281                 struct btrfs_root *subvol_root,
4282                 struct btrfs_block_group *bg,
4283                 struct extent_buffer *subvol_parent, int subvol_slot,
4284                 struct extent_buffer *reloc_parent, int reloc_slot,
4285                 u64 last_snapshot)
4286 {
4287         struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4288         struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4289         struct btrfs_qgroup_swapped_block *block;
4290         struct rb_node **cur;
4291         struct rb_node *parent = NULL;
4292         int level = btrfs_header_level(subvol_parent) - 1;
4293         int ret = 0;
4294
4295         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4296                 return 0;
4297
4298         if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4299             btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4300                 btrfs_err_rl(fs_info,
4301                 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4302                         __func__,
4303                         btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4304                         btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4305                 return -EUCLEAN;
4306         }
4307
4308         block = kmalloc(sizeof(*block), GFP_NOFS);
4309         if (!block) {
4310                 ret = -ENOMEM;
4311                 goto out;
4312         }
4313
4314         /*
4315          * @reloc_parent/slot is still before the swap, while @block is going
4316          * to record the bytenr after the swap, so we swap the two values here.
4317          */
4318         block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4319         block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4320                                                              reloc_slot);
4321         block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4322         block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4323                                                             subvol_slot);
4324         block->last_snapshot = last_snapshot;
4325         block->level = level;
4326
4327         /*
4328          * If we have bg == NULL, we're called from btrfs_recover_relocation(),
4329          * and no one else can modify tree blocks, thus the qgroup numbers will
4330          * not change no matter the value of trace_leaf.
4331          */
4332         if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4333                 block->trace_leaf = true;
4334         else
4335                 block->trace_leaf = false;
4336         btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4337
4338         /* Insert @block into @blocks */
4339         spin_lock(&blocks->lock);
4340         cur = &blocks->blocks[level].rb_node;
4341         while (*cur) {
4342                 struct btrfs_qgroup_swapped_block *entry;
4343
4344                 parent = *cur;
4345                 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4346                                  node);
4347
4348                 if (entry->subvol_bytenr < block->subvol_bytenr) {
4349                         cur = &(*cur)->rb_left;
4350                 } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4351                         cur = &(*cur)->rb_right;
4352                 } else {
4353                         if (entry->subvol_generation !=
4354                                         block->subvol_generation ||
4355                             entry->reloc_bytenr != block->reloc_bytenr ||
4356                             entry->reloc_generation !=
4357                                         block->reloc_generation) {
4358                                 /*
4359                                  * Duplicate but mismatched entry found.
4360                                  * This shouldn't happen.
4361                                  *
4362                                  * Marking the qgroup inconsistent should
4363                                  * be enough for end users.
4364                                  */
4365                                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4366                                 ret = -EEXIST;
4367                         }
4368                         kfree(block);
4369                         goto out_unlock;
4370                 }
4371         }
4372         rb_link_node(&block->node, parent, cur);
4373         rb_insert_color(&block->node, &blocks->blocks[level]);
4374         blocks->swapped = true;
4375 out_unlock:
4376         spin_unlock(&blocks->lock);
4377 out:
4378         if (ret < 0)
4379                 qgroup_mark_inconsistent(fs_info);
4380         return ret;
4381 }
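
/*
 * The insertion above follows the usual Linux rbtree pattern: walk down from
 * the root remembering the parent and the link pointer to patch, then call
 * rb_link_node() and rb_insert_color().  Generic sketch with a made-up
 * 'struct foo' keyed by 'bytenr', using the same comparison direction as the
 * code above so it stays consistent with the lookup later in this file:
 *
 *	struct rb_node **cur = &root->rb_node;
 *	struct rb_node *parent = NULL;
 *
 *	while (*cur) {
 *		struct foo *entry = rb_entry(*cur, struct foo, node);
 *
 *		parent = *cur;
 *		if (entry->bytenr < new->bytenr)
 *			cur = &(*cur)->rb_left;
 *		else if (entry->bytenr > new->bytenr)
 *			cur = &(*cur)->rb_right;
 *		else
 *			return -EEXIST;	// duplicate key
 *	}
 *	rb_link_node(&new->node, parent, cur);
 *	rb_insert_color(&new->node, root);
 */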
4382
4383 /*
4384  * Check if the tree block is a swapped subtree root, and if so do the needed
4385  * delayed subtree tracing for qgroup.
4386  *
4387  * This is called during btrfs_cow_block().
4388  */
4389 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4390                                          struct btrfs_root *root,
4391                                          struct extent_buffer *subvol_eb)
4392 {
4393         struct btrfs_fs_info *fs_info = root->fs_info;
4394         struct btrfs_tree_parent_check check = { 0 };
4395         struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4396         struct btrfs_qgroup_swapped_block *block;
4397         struct extent_buffer *reloc_eb = NULL;
4398         struct rb_node *node;
4399         bool found = false;
4400         bool swapped = false;
4401         int level = btrfs_header_level(subvol_eb);
4402         int ret = 0;
4403         int i;
4404
4405         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4406                 return 0;
4407         if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4408                 return 0;
4409
4410         spin_lock(&blocks->lock);
4411         if (!blocks->swapped) {
4412                 spin_unlock(&blocks->lock);
4413                 return 0;
4414         }
4415         node = blocks->blocks[level].rb_node;
4416
4417         while (node) {
4418                 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4419                 if (block->subvol_bytenr < subvol_eb->start) {
4420                         node = node->rb_left;
4421                 } else if (block->subvol_bytenr > subvol_eb->start) {
4422                         node = node->rb_right;
4423                 } else {
4424                         found = true;
4425                         break;
4426                 }
4427         }
4428         if (!found) {
4429                 spin_unlock(&blocks->lock);
4430                 goto out;
4431         }
4432         /* Found one, remove it from @blocks first and update blocks->swapped */
4433         rb_erase(&block->node, &blocks->blocks[level]);
4434         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4435                 if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
4436                         swapped = true;
4437                         break;
4438                 }
4439         }
4440         blocks->swapped = swapped;
4441         spin_unlock(&blocks->lock);
4442
4443         check.level = block->level;
4444         check.transid = block->reloc_generation;
4445         check.has_first_key = true;
4446         memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));
4447
4448         /* Read out reloc subtree root */
4449         reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
4450         if (IS_ERR(reloc_eb)) {
4451                 ret = PTR_ERR(reloc_eb);
4452                 reloc_eb = NULL;
4453                 goto free_out;
4454         }
4455         if (!extent_buffer_uptodate(reloc_eb)) {
4456                 ret = -EIO;
4457                 goto free_out;
4458         }
4459
4460         ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4461                         block->last_snapshot, block->trace_leaf);
4462 free_out:
4463         kfree(block);
4464         free_extent_buffer(reloc_eb);
4465 out:
4466         if (ret < 0) {
4467                 btrfs_err_rl(fs_info,
4468                              "failed to account subtree at bytenr %llu: %d",
4469                              subvol_eb->start, ret);
4470                 qgroup_mark_inconsistent(fs_info);
4471         }
4472         return ret;
4473 }
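
/*
 * Rough shape of the call site in the COW path, per the comment above this
 * function.  This is only a sketch of how a caller is expected to use the
 * helper, not a copy of btrfs_cow_block(); the error handling in the real
 * caller is more involved:
 *
 *	// @buf is the subvolume tree block that has just been COWed
 *	ret = btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
 *	if (ret < 0)
 *		// qgroup was already marked inconsistent by the helper,
 *		// so callers typically just propagate the error
 *		return ret;
 */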
4474
4475 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4476 {
4477         struct btrfs_qgroup_extent_record *entry;
4478         struct btrfs_qgroup_extent_record *next;
4479         struct rb_root *root;
4480
4481         root = &trans->delayed_refs.dirty_extent_root;
4482         rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4483                 ulist_free(entry->old_roots);
4484                 kfree(entry);
4485         }
4486         *root = RB_ROOT;
4487 }