fs/btrfs/block-group.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "misc.h"
   4 #include "ctree.h"
   5 #include "block-group.h"
   6 #include "space-info.h"
   7 #include "disk-io.h"
   8 #include "free-space-cache.h"
   9 #include "free-space-tree.h"
  10 #include "volumes.h"
  11 #include "transaction.h"
  12 #include "ref-verify.h"
  13 #include "sysfs.h"
  14 #include "tree-log.h"
  15 #include "delalloc-space.h"
  16 #include "discard.h"
  17 #include "raid56.h"
  18
  19 /*
  20  * Return target flags in extended format or 0 if restripe for this chunk_type
  21  * is not in progress
  22  *
  23  * Should be called with balance_lock held
  24  */
  25 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  26 {
  27         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  28         u64 target = 0;
  29
  30         if (!bctl)
  31                 return 0;
  32
  33         if (flags & BTRFS_BLOCK_GROUP_DATA &&
  34             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  35                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  36         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  37                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  38                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  39         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  40                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  41                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  42         }
  43
  44         return target;
  45 }
  46
  47 /*
  48  * @flags: available profiles in extended format (see ctree.h)
  49  *
  50  * Return reduced profile in chunk format.  If profile changing is in progress
  51  * (either running or paused) picks the target profile (if it's already
  52  * available), otherwise falls back to plain reducing.
  53  */
  54 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  55 {
  56         u64 num_devices = fs_info->fs_devices->rw_devices;
  57         u64 target;
  58         u64 raid_type;
  59         u64 allowed = 0;
  60
  61         /*
  62          * See if restripe for this chunk_type is in progress, if so try to
  63          * reduce to the target profile
  64          */
  65         spin_lock(&fs_info->balance_lock);
  66         target = get_restripe_target(fs_info, flags);
  67         if (target) {
  68                 /* Pick target profile only if it's already available */
  69                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
  70                         spin_unlock(&fs_info->balance_lock);
  71                         return extended_to_chunk(target);
  72                 }
  73         }
  74         spin_unlock(&fs_info->balance_lock);
  75
  76         /* First, mask out the RAID levels which aren't possible */
  77         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  78                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  79                         allowed |= btrfs_raid_array[raid_type].bg_flag;
  80         }
  81         allowed &= flags;
  82
  83         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  84                 allowed = BTRFS_BLOCK_GROUP_RAID6;
  85         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  86                 allowed = BTRFS_BLOCK_GROUP_RAID5;
  87         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  88                 allowed = BTRFS_BLOCK_GROUP_RAID10;
  89         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  90                 allowed = BTRFS_BLOCK_GROUP_RAID1;
  91         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  92                 allowed = BTRFS_BLOCK_GROUP_RAID0;
  93
  94         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  95
  96         return extended_to_chunk(flags | allowed);
  97 }
  98
  99 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 100 {
 101         unsigned seq;
 102         u64 flags;
 103
 104         do {
 105                 flags = orig_flags;
 106                 seq = read_seqbegin(&fs_info->profiles_lock);
 107
 108                 if (flags & BTRFS_BLOCK_GROUP_DATA)
 109                         flags |= fs_info->avail_data_alloc_bits;
 110                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 111                         flags |= fs_info->avail_system_alloc_bits;
 112                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 113                         flags |= fs_info->avail_metadata_alloc_bits;
 114         } while (read_seqretry(&fs_info->profiles_lock, seq));
 115
 116         return btrfs_reduce_alloc_profile(fs_info, flags);
 117 }
 118
 119 void btrfs_get_block_group(struct btrfs_block_group *cache)
 120 {
 121         atomic_inc(&cache->count);
 122 }
 123
 124 void btrfs_put_block_group(struct btrfs_block_group *cache)
 125 {
 126         if (atomic_dec_and_test(&cache->count)) {
 127                 WARN_ON(cache->pinned > 0);
 128                 WARN_ON(cache->reserved > 0);
 129
 130                 /*
 131                  * A block_group shouldn't be on the discard_list anymore.
 132                  * Remove the block_group from the discard_list to prevent us
 133                  * from causing a panic due to NULL pointer dereference.
 134                  */
 135                 if (WARN_ON(!list_empty(&cache->discard_list)))
 136                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 137                                                   cache);
 138
 139                 /*
 140                  * If not empty, someone is still holding mutex of
 141                  * full_stripe_lock, which can only be released by caller.
 142                  * And it will definitely cause use-after-free when caller
 143                  * tries to release full stripe lock.
 144                  *
 145                  * No better way to resolve, but only to warn.
 146                  */
 147                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 148                 kfree(cache->free_space_ctl);
 149                 kfree(cache);
 150         }
 151 }
 152
 153 /*
 154  * This adds the block group to the fs_info rb tree for the block group cache
 155  */
 156 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 157                                        struct btrfs_block_group *block_group)
 158 {
 159         struct rb_node **p;
 160         struct rb_node *parent = NULL;
 161         struct btrfs_block_group *cache;
 162
 163         ASSERT(block_group->length != 0);
 164
 165         spin_lock(&info->block_group_cache_lock);
 166         p = &info->block_group_cache_tree.rb_node;
 167
 168         while (*p) {
 169                 parent = *p;
 170                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 171                 if (block_group->start < cache->start) {
 172                         p = &(*p)->rb_left;
 173                 } else if (block_group->start > cache->start) {
 174                         p = &(*p)->rb_right;
 175                 } else {
 176                         spin_unlock(&info->block_group_cache_lock);
 177                         return -EEXIST;
 178                 }
 179         }
 180
 181         rb_link_node(&block_group->cache_node, parent, p);
 182         rb_insert_color(&block_group->cache_node,
 183                         &info->block_group_cache_tree);
 184
 185         if (info->first_logical_byte > block_group->start)
 186                 info->first_logical_byte = block_group->start;
 187
 188         spin_unlock(&info->block_group_cache_lock);
 189
 190         return 0;
 191 }
 192
 193 /*
 194  * This will return the block group at or after bytenr if contains is 0, else
 195  * it will return the block group that contains the bytenr
 196  */
 197 static struct btrfs_block_group *block_group_cache_tree_search(
 198                 struct btrfs_fs_info *info, u64 bytenr, int contains)
 199 {
 200         struct btrfs_block_group *cache, *ret = NULL;
 201         struct rb_node *n;
 202         u64 end, start;
 203
 204         spin_lock(&info->block_group_cache_lock);
 205         n = info->block_group_cache_tree.rb_node;
 206
 207         while (n) {
 208                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
 209                 end = cache->start + cache->length - 1;
 210                 start = cache->start;
 211
 212                 if (bytenr < start) {
 213                         if (!contains && (!ret || start < ret->start))
 214                                 ret = cache;
 215                         n = n->rb_left;
 216                 } else if (bytenr > start) {
 217                         if (contains && bytenr <= end) {
 218                                 ret = cache;
 219                                 break;
 220                         }
 221                         n = n->rb_right;
 222                 } else {
 223                         ret = cache;
 224                         break;
 225                 }
 226         }
 227         if (ret) {
 228                 btrfs_get_block_group(ret);
 229                 if (bytenr == 0 && info->first_logical_byte > ret->start)
 230                         info->first_logical_byte = ret->start;
 231         }
 232         spin_unlock(&info->block_group_cache_lock);
 233
 234         return ret;
 235 }
 236
 237 /*
 238  * Return the block group that starts at or after bytenr
 239  */
 240 struct btrfs_block_group *btrfs_lookup_first_block_group(
 241                 struct btrfs_fs_info *info, u64 bytenr)
 242 {
 243         return block_group_cache_tree_search(info, bytenr, 0);
 244 }
 245
 246 /*
 247  * Return the block group that contains the given bytenr
 248  */
 249 struct btrfs_block_group *btrfs_lookup_block_group(
 250                 struct btrfs_fs_info *info, u64 bytenr)
 251 {
 252         return block_group_cache_tree_search(info, bytenr, 1);
 253 }
 254
 255 struct btrfs_block_group *btrfs_next_block_group(
 256                 struct btrfs_block_group *cache)
 257 {
 258         struct btrfs_fs_info *fs_info = cache->fs_info;
 259         struct rb_node *node;
 260
 261         spin_lock(&fs_info->block_group_cache_lock);
 262
 263         /* If our block group was removed, we need a full search. */
 264         if (RB_EMPTY_NODE(&cache->cache_node)) {
 265                 const u64 next_bytenr = cache->start + cache->length;
 266
 267                 spin_unlock(&fs_info->block_group_cache_lock);
 268                 btrfs_put_block_group(cache);
 269                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 270         }
 271         node = rb_next(&cache->cache_node);
 272         btrfs_put_block_group(cache);
 273         if (node) {
 274                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
 275                 btrfs_get_block_group(cache);
 276         } else
 277                 cache = NULL;
 278         spin_unlock(&fs_info->block_group_cache_lock);
 279         return cache;
 280 }
 281
 282 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 283 {
 284         struct btrfs_block_group *bg;
 285         bool ret = true;
 286
 287         bg = btrfs_lookup_block_group(fs_info, bytenr);
 288         if (!bg)
 289                 return false;
 290
 291         spin_lock(&bg->lock);
 292         if (bg->ro)
 293                 ret = false;
 294         else
 295                 atomic_inc(&bg->nocow_writers);
 296         spin_unlock(&bg->lock);
 297
 298         /* No put on block group, done by btrfs_dec_nocow_writers */
 299         if (!ret)
 300                 btrfs_put_block_group(bg);
 301
 302         return ret;
 303 }
 304
 305 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 306 {
 307         struct btrfs_block_group *bg;
 308
 309         bg = btrfs_lookup_block_group(fs_info, bytenr);
 310         ASSERT(bg);
 311         if (atomic_dec_and_test(&bg->nocow_writers))
 312                 wake_up_var(&bg->nocow_writers);
 313         /*
 314          * Once for our lookup and once for the lookup done by a previous call
 315          * to btrfs_inc_nocow_writers()
 316          */
 317         btrfs_put_block_group(bg);
 318         btrfs_put_block_group(bg);
 319 }
 320
 321 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 322 {
 323         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 324 }
 325
 326 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 327                                         const u64 start)
 328 {
 329         struct btrfs_block_group *bg;
 330
 331         bg = btrfs_lookup_block_group(fs_info, start);
 332         ASSERT(bg);
 333         if (atomic_dec_and_test(&bg->reservations))
 334                 wake_up_var(&bg->reservations);
 335         btrfs_put_block_group(bg);
 336 }
 337
 338 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 339 {
 340         struct btrfs_space_info *space_info = bg->space_info;
 341
 342         ASSERT(bg->ro);
 343
 344         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 345                 return;
 346
 347         /*
 348          * Our block group is read only but before we set it to read only,
 349          * some task might have had allocated an extent from it already, but it
 350          * has not yet created a respective ordered extent (and added it to a
 351          * root's list of ordered extents).
 352          * Therefore wait for any task currently allocating extents, since the
 353          * block group's reservations counter is incremented while a read lock
 354          * on the groups' semaphore is held and decremented after releasing
 355          * the read access on that semaphore and creating the ordered extent.
 356          */
 357         down_write(&space_info->groups_sem);
 358         up_write(&space_info->groups_sem);
 359
 360         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 361 }
 362
 363 struct btrfs_caching_control *btrfs_get_caching_control(
 364                 struct btrfs_block_group *cache)
 365 {
 366         struct btrfs_caching_control *ctl;
 367
 368         spin_lock(&cache->lock);
 369         if (!cache->caching_ctl) {
 370                 spin_unlock(&cache->lock);
 371                 return NULL;
 372         }
 373
 374         ctl = cache->caching_ctl;
 375         refcount_inc(&ctl->count);
 376         spin_unlock(&cache->lock);
 377         return ctl;
 378 }
 379
 380 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 381 {
 382         if (refcount_dec_and_test(&ctl->count))
 383                 kfree(ctl);
 384 }
 385
 386 /*
 387  * When we wait for progress in the block group caching, its because our
 388  * allocation attempt failed at least once.  So, we must sleep and let some
 389  * progress happen before we try again.
 390  *
 391  * This function will sleep at least once waiting for new free space to show
 392  * up, and then it will check the block group free space numbers for our min
 393  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 394  * a free extent of a given size, but this is a good start.
 395  *
 396  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 397  * any of the information in this block group.
 398  */
 399 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 400                                            u64 num_bytes)
 401 {
 402         struct btrfs_caching_control *caching_ctl;
 403
 404         caching_ctl = btrfs_get_caching_control(cache);
 405         if (!caching_ctl)
 406                 return;
 407
 408         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 409                    (cache->free_space_ctl->free_space >= num_bytes));
 410
 411         btrfs_put_caching_control(caching_ctl);
 412 }
 413
 414 int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 415 {
 416         struct btrfs_caching_control *caching_ctl;
 417         int ret = 0;
 418
 419         caching_ctl = btrfs_get_caching_control(cache);
 420         if (!caching_ctl)
 421                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 422
 423         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 424         if (cache->cached == BTRFS_CACHE_ERROR)
 425                 ret = -EIO;
 426         btrfs_put_caching_control(caching_ctl);
 427         return ret;
 428 }
 429
 430 #ifdef CONFIG_BTRFS_DEBUG
 431 static void fragment_free_space(struct btrfs_block_group *block_group)
 432 {
 433         struct btrfs_fs_info *fs_info = block_group->fs_info;
 434         u64 start = block_group->start;
 435         u64 len = block_group->length;
 436         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 437                 fs_info->nodesize : fs_info->sectorsize;
 438         u64 step = chunk << 1;
 439
 440         while (len > chunk) {
 441                 btrfs_remove_free_space(block_group, start, chunk);
 442                 start += step;
 443                 if (len < step)
 444                         len = 0;
 445                 else
 446                         len -= step;
 447         }
 448 }
 449 #endif
 450
 451 /*
 452  * This is only called by btrfs_cache_block_group, since we could have freed
 453  * extents we need to check the pinned_extents for any extents that can't be
 454  * used yet since their free space will be released as soon as the transaction
 455  * commits.
 456  */
 457 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 458 {
 459         struct btrfs_fs_info *info = block_group->fs_info;
 460         u64 extent_start, extent_end, size, total_added = 0;
 461         int ret;
 462
 463         while (start < end) {
 464                 ret = find_first_extent_bit(&info->excluded_extents, start,
 465                                             &extent_start, &extent_end,
 466                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 467                                             NULL);
 468                 if (ret)
 469                         break;
 470
 471                 if (extent_start <= start) {
 472                         start = extent_end + 1;
 473                 } else if (extent_start > start && extent_start < end) {
 474                         size = extent_start - start;
 475                         total_added += size;
 476                         ret = btrfs_add_free_space_async_trimmed(block_group,
 477                                                                  start, size);
 478                         BUG_ON(ret); /* -ENOMEM or logic error */
 479                         start = extent_end + 1;
 480                 } else {
 481                         break;
 482                 }
 483         }
 484
 485         if (start < end) {
 486                 size = end - start;
 487                 total_added += size;
 488                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
 489                                                          size);
 490                 BUG_ON(ret); /* -ENOMEM or logic error */
 491         }
 492
 493         return total_added;
 494 }
 495
 496 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 497 {
 498         struct btrfs_block_group *block_group = caching_ctl->block_group;
 499         struct btrfs_fs_info *fs_info = block_group->fs_info;
 500         struct btrfs_root *extent_root = fs_info->extent_root;
 501         struct btrfs_path *path;
 502         struct extent_buffer *leaf;
 503         struct btrfs_key key;
 504         u64 total_found = 0;
 505         u64 last = 0;
 506         u32 nritems;
 507         int ret;
 508         bool wakeup = true;
 509
 510         path = btrfs_alloc_path();
 511         if (!path)
 512                 return -ENOMEM;
 513
 514         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 515
 516 #ifdef CONFIG_BTRFS_DEBUG
 517         /*
 518          * If we're fragmenting we don't want to make anybody think we can
 519          * allocate from this block group until we've had a chance to fragment
 520          * the free space.
 521          */
 522         if (btrfs_should_fragment_free_space(block_group))
 523                 wakeup = false;
 524 #endif
 525         /*
 526          * We don't want to deadlock with somebody trying to allocate a new
 527          * extent for the extent root while also trying to search the extent
 528          * root to add free space.  So we skip locking and search the commit
 529          * root, since its read-only
 530          */
 531         path->skip_locking = 1;
 532         path->search_commit_root = 1;
 533         path->reada = READA_FORWARD;
 534
 535         key.objectid = last;
 536         key.offset = 0;
 537         key.type = BTRFS_EXTENT_ITEM_KEY;
 538
 539 next:
 540         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 541         if (ret < 0)
 542                 goto out;
 543
 544         leaf = path->nodes[0];
 545         nritems = btrfs_header_nritems(leaf);
 546
 547         while (1) {
 548                 if (btrfs_fs_closing(fs_info) > 1) {
 549                         last = (u64)-1;
 550                         break;
 551                 }
 552
 553                 if (path->slots[0] < nritems) {
 554                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 555                 } else {
 556                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 557                         if (ret)
 558                                 break;
 559
 560                         if (need_resched() ||
 561                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 562                                 if (wakeup)
 563                                         caching_ctl->progress = last;
 564                                 btrfs_release_path(path);
 565                                 up_read(&fs_info->commit_root_sem);
 566                                 mutex_unlock(&caching_ctl->mutex);
 567                                 cond_resched();
 568                                 mutex_lock(&caching_ctl->mutex);
 569                                 down_read(&fs_info->commit_root_sem);
 570                                 goto next;
 571                         }
 572
 573                         ret = btrfs_next_leaf(extent_root, path);
 574                         if (ret < 0)
 575                                 goto out;
 576                         if (ret)
 577                                 break;
 578                         leaf = path->nodes[0];
 579                         nritems = btrfs_header_nritems(leaf);
 580                         continue;
 581                 }
 582
 583                 if (key.objectid < last) {
 584                         key.objectid = last;
 585                         key.offset = 0;
 586                         key.type = BTRFS_EXTENT_ITEM_KEY;
 587
 588                         if (wakeup)
 589                                 caching_ctl->progress = last;
 590                         btrfs_release_path(path);
 591                         goto next;
 592                 }
 593
 594                 if (key.objectid < block_group->start) {
 595                         path->slots[0]++;
 596                         continue;
 597                 }
 598
 599                 if (key.objectid >= block_group->start + block_group->length)
 600                         break;
 601
 602                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 603                     key.type == BTRFS_METADATA_ITEM_KEY) {
 604                         total_found += add_new_free_space(block_group, last,
 605                                                           key.objectid);
 606                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 607                                 last = key.objectid +
 608                                         fs_info->nodesize;
 609                         else
 610                                 last = key.objectid + key.offset;
 611
 612                         if (total_found > CACHING_CTL_WAKE_UP) {
 613                                 total_found = 0;
 614                                 if (wakeup)
 615                                         wake_up(&caching_ctl->wait);
 616                         }
 617                 }
 618                 path->slots[0]++;
 619         }
 620         ret = 0;
 621
 622         total_found += add_new_free_space(block_group, last,
 623                                 block_group->start + block_group->length);
 624         caching_ctl->progress = (u64)-1;
 625
 626 out:
 627         btrfs_free_path(path);
 628         return ret;
 629 }
 630
 631 static noinline void caching_thread(struct btrfs_work *work)
 632 {
 633         struct btrfs_block_group *block_group;
 634         struct btrfs_fs_info *fs_info;
 635         struct btrfs_caching_control *caching_ctl;
 636         int ret;
 637
 638         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 639         block_group = caching_ctl->block_group;
 640         fs_info = block_group->fs_info;
 641
 642         mutex_lock(&caching_ctl->mutex);
 643         down_read(&fs_info->commit_root_sem);
 644
 645         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 646                 ret = load_free_space_tree(caching_ctl);
 647         else
 648                 ret = load_extent_tree_free(caching_ctl);
 649
 650         spin_lock(&block_group->lock);
 651         block_group->caching_ctl = NULL;
 652         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 653         spin_unlock(&block_group->lock);
 654
 655 #ifdef CONFIG_BTRFS_DEBUG
 656         if (btrfs_should_fragment_free_space(block_group)) {
 657                 u64 bytes_used;
 658
 659                 spin_lock(&block_group->space_info->lock);
 660                 spin_lock(&block_group->lock);
 661                 bytes_used = block_group->length - block_group->used;
 662                 block_group->space_info->bytes_used += bytes_used >> 1;
 663                 spin_unlock(&block_group->lock);
 664                 spin_unlock(&block_group->space_info->lock);
 665                 fragment_free_space(block_group);
 666         }
 667 #endif
 668
 669         caching_ctl->progress = (u64)-1;
 670
 671         up_read(&fs_info->commit_root_sem);
 672         btrfs_free_excluded_extents(block_group);
 673         mutex_unlock(&caching_ctl->mutex);
 674
 675         wake_up(&caching_ctl->wait);
 676
 677         btrfs_put_caching_control(caching_ctl);
 678         btrfs_put_block_group(block_group);
 679 }
 680
 681 int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 682 {
 683         DEFINE_WAIT(wait);
 684         struct btrfs_fs_info *fs_info = cache->fs_info;
 685         struct btrfs_caching_control *caching_ctl;
 686         int ret = 0;
 687
 688         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 689         if (!caching_ctl)
 690                 return -ENOMEM;
 691
 692         INIT_LIST_HEAD(&caching_ctl->list);
 693         mutex_init(&caching_ctl->mutex);
 694         init_waitqueue_head(&caching_ctl->wait);
 695         caching_ctl->block_group = cache;
 696         caching_ctl->progress = cache->start;
 697         refcount_set(&caching_ctl->count, 1);
 698         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 699
 700         spin_lock(&cache->lock);
 701         /*
 702          * This should be a rare occasion, but this could happen I think in the
 703          * case where one thread starts to load the space cache info, and then
 704          * some other thread starts a transaction commit which tries to do an
 705          * allocation while the other thread is still loading the space cache
 706          * info.  The previous loop should have kept us from choosing this block
 707          * group, but if we've moved to the state where we will wait on caching
 708          * block groups we need to first check if we're doing a fast load here,
 709          * so we can wait for it to finish, otherwise we could end up allocating
 710          * from a block group who's cache gets evicted for one reason or
 711          * another.
 712          */
 713         while (cache->cached == BTRFS_CACHE_FAST) {
 714                 struct btrfs_caching_control *ctl;
 715
 716                 ctl = cache->caching_ctl;
 717                 refcount_inc(&ctl->count);
 718                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 719                 spin_unlock(&cache->lock);
 720
 721                 schedule();
 722
 723                 finish_wait(&ctl->wait, &wait);
 724                 btrfs_put_caching_control(ctl);
 725                 spin_lock(&cache->lock);
 726         }
 727
 728         if (cache->cached != BTRFS_CACHE_NO) {
 729                 spin_unlock(&cache->lock);
 730                 kfree(caching_ctl);
 731                 return 0;
 732         }
 733         WARN_ON(cache->caching_ctl);
 734         cache->caching_ctl = caching_ctl;
 735         cache->cached = BTRFS_CACHE_FAST;
 736         spin_unlock(&cache->lock);
 737
 738         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 739                 mutex_lock(&caching_ctl->mutex);
 740                 ret = load_free_space_cache(cache);
 741
 742                 spin_lock(&cache->lock);
 743                 if (ret == 1) {
 744                         cache->caching_ctl = NULL;
 745                         cache->cached = BTRFS_CACHE_FINISHED;
 746                         cache->last_byte_to_unpin = (u64)-1;
 747                         caching_ctl->progress = (u64)-1;
 748                 } else {
 749                         if (load_cache_only) {
 750                                 cache->caching_ctl = NULL;
 751                                 cache->cached = BTRFS_CACHE_NO;
 752                         } else {
 753                                 cache->cached = BTRFS_CACHE_STARTED;
 754                                 cache->has_caching_ctl = 1;
 755                         }
 756                 }
 757                 spin_unlock(&cache->lock);
 758 #ifdef CONFIG_BTRFS_DEBUG
 759                 if (ret == 1 &&
 760                     btrfs_should_fragment_free_space(cache)) {
 761                         u64 bytes_used;
 762
 763                         spin_lock(&cache->space_info->lock);
 764                         spin_lock(&cache->lock);
 765                         bytes_used = cache->length - cache->used;
 766                         cache->space_info->bytes_used += bytes_used >> 1;
 767                         spin_unlock(&cache->lock);
 768                         spin_unlock(&cache->space_info->lock);
 769                         fragment_free_space(cache);
 770                 }
 771 #endif
 772                 mutex_unlock(&caching_ctl->mutex);
 773
 774                 wake_up(&caching_ctl->wait);
 775                 if (ret == 1) {
 776                         btrfs_put_caching_control(caching_ctl);
 777                         btrfs_free_excluded_extents(cache);
 778                         return 0;
 779                 }
 780         } else {
 781                 /*
 782                  * We're either using the free space tree or no caching at all.
 783                  * Set cached to the appropriate value and wakeup any waiters.
 784                  */
 785                 spin_lock(&cache->lock);
 786                 if (load_cache_only) {
 787                         cache->caching_ctl = NULL;
 788                         cache->cached = BTRFS_CACHE_NO;
 789                 } else {
 790                         cache->cached = BTRFS_CACHE_STARTED;
 791                         cache->has_caching_ctl = 1;
 792                 }
 793                 spin_unlock(&cache->lock);
 794                 wake_up(&caching_ctl->wait);
 795         }
 796
 797         if (load_cache_only) {
 798                 btrfs_put_caching_control(caching_ctl);
 799                 return 0;
 800         }
 801
 802         down_write(&fs_info->commit_root_sem);
 803         refcount_inc(&caching_ctl->count);
 804         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 805         up_write(&fs_info->commit_root_sem);
 806
 807         btrfs_get_block_group(cache);
 808
 809         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 810
 811         return ret;
 812 }
 813
 814 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 815 {
 816         u64 extra_flags = chunk_to_extended(flags) &
 817                                 BTRFS_EXTENDED_PROFILE_MASK;
 818
 819         write_seqlock(&fs_info->profiles_lock);
 820         if (flags & BTRFS_BLOCK_GROUP_DATA)
 821                 fs_info->avail_data_alloc_bits &= ~extra_flags;
 822         if (flags & BTRFS_BLOCK_GROUP_METADATA)
 823                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 824         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 825                 fs_info->avail_system_alloc_bits &= ~extra_flags;
 826         write_sequnlock(&fs_info->profiles_lock);
 827 }
 828
 829 /*
 830  * Clear incompat bits for the following feature(s):
 831  *
 832  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 833  *            in the whole filesystem
 834  *
 835  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 836  */
 837 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 838 {
 839         bool found_raid56 = false;
 840         bool found_raid1c34 = false;
 841
 842         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
 843             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
 844             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
 845                 struct list_head *head = &fs_info->space_info;
 846                 struct btrfs_space_info *sinfo;
 847
 848                 list_for_each_entry_rcu(sinfo, head, list) {
 849                         down_read(&sinfo->groups_sem);
 850                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
 851                                 found_raid56 = true;
 852                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
 853                                 found_raid56 = true;
 854                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
 855                                 found_raid1c34 = true;
 856                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
 857                                 found_raid1c34 = true;
 858                         up_read(&sinfo->groups_sem);
 859                 }
 860                 if (!found_raid56)
 861                         btrfs_clear_fs_incompat(fs_info, RAID56);
 862                 if (!found_raid1c34)
 863                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
 864         }
 865 }
 866
 867 static int remove_block_group_item(struct btrfs_trans_handle *trans,
 868                                    struct btrfs_path *path,
 869                                    struct btrfs_block_group *block_group)
 870 {
 871         struct btrfs_fs_info *fs_info = trans->fs_info;
 872         struct btrfs_root *root;
 873         struct btrfs_key key;
 874         int ret;
 875
 876         root = fs_info->extent_root;
 877         key.objectid = block_group->start;
 878         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 879         key.offset = block_group->length;
 880
 881         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 882         if (ret > 0)
 883                 ret = -ENOENT;
 884         if (ret < 0)
 885                 return ret;
 886
 887         ret = btrfs_del_item(trans, root, path);
 888         return ret;
 889 }
 890
 891 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 892                              u64 group_start, struct extent_map *em)
 893 {
 894         struct btrfs_fs_info *fs_info = trans->fs_info;
 895         struct btrfs_path *path;
 896         struct btrfs_block_group *block_group;
 897         struct btrfs_free_cluster *cluster;
 898         struct btrfs_root *tree_root = fs_info->tree_root;
 899         struct btrfs_key key;
 900         struct inode *inode;
 901         struct kobject *kobj = NULL;
 902         int ret;
 903         int index;
 904         int factor;
 905         struct btrfs_caching_control *caching_ctl = NULL;
 906         bool remove_em;
 907         bool remove_rsv = false;
 908
 909         block_group = btrfs_lookup_block_group(fs_info, group_start);
 910         BUG_ON(!block_group);
 911         BUG_ON(!block_group->ro);
 912
 913         trace_btrfs_remove_block_group(block_group);
 914         /*
 915          * Free the reserved super bytes from this block group before
 916          * remove it.
 917          */
 918         btrfs_free_excluded_extents(block_group);
 919         btrfs_free_ref_tree_range(fs_info, block_group->start,
 920                                   block_group->length);
 921
 922         index = btrfs_bg_flags_to_raid_index(block_group->flags);
 923         factor = btrfs_bg_type_to_factor(block_group->flags);
 924
 925         /* make sure this block group isn't part of an allocation cluster */
 926         cluster = &fs_info->data_alloc_cluster;
 927         spin_lock(&cluster->refill_lock);
 928         btrfs_return_cluster_to_free_space(block_group, cluster);
 929         spin_unlock(&cluster->refill_lock);
 930
 931         /*
 932          * make sure this block group isn't part of a metadata
 933          * allocation cluster
 934          */
 935         cluster = &fs_info->meta_alloc_cluster;
 936         spin_lock(&cluster->refill_lock);
 937         btrfs_return_cluster_to_free_space(block_group, cluster);
 938         spin_unlock(&cluster->refill_lock);
 939
 940         path = btrfs_alloc_path();
 941         if (!path) {
 942                 ret = -ENOMEM;
 943                 goto out_put_group;
 944         }
 945
 946         /*
 947          * get the inode first so any iput calls done for the io_list
 948          * aren't the final iput (no unlinks allowed now)
 949          */
 950         inode = lookup_free_space_inode(block_group, path);
 951
 952         mutex_lock(&trans->transaction->cache_write_mutex);
 953         /*
 954          * Make sure our free space cache IO is done before removing the
 955          * free space inode
 956          */
 957         spin_lock(&trans->transaction->dirty_bgs_lock);
 958         if (!list_empty(&block_group->io_list)) {
 959                 list_del_init(&block_group->io_list);
 960
 961                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
 962
 963                 spin_unlock(&trans->transaction->dirty_bgs_lock);
 964                 btrfs_wait_cache_io(trans, block_group, path);
 965                 btrfs_put_block_group(block_group);
 966                 spin_lock(&trans->transaction->dirty_bgs_lock);
 967         }
 968
 969         if (!list_empty(&block_group->dirty_list)) {
 970                 list_del_init(&block_group->dirty_list);
 971                 remove_rsv = true;
 972                 btrfs_put_block_group(block_group);
 973         }
 974         spin_unlock(&trans->transaction->dirty_bgs_lock);
 975         mutex_unlock(&trans->transaction->cache_write_mutex);
 976
 977         if (!IS_ERR(inode)) {
 978                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 979                 if (ret) {
 980                         btrfs_add_delayed_iput(inode);
 981                         goto out_put_group;
 982                 }
 983                 clear_nlink(inode);
 984                 /* One for the block groups ref */
 985                 spin_lock(&block_group->lock);
 986                 if (block_group->iref) {
 987                         block_group->iref = 0;
 988                         block_group->inode = NULL;
 989                         spin_unlock(&block_group->lock);
 990                         iput(inode);
 991                 } else {
 992                         spin_unlock(&block_group->lock);
 993                 }
 994                 /* One for our lookup ref */
 995                 btrfs_add_delayed_iput(inode);
 996         }
 997
 998         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 999         key.type = 0;
1000         key.offset = block_group->start;
1001
1002         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
1003         if (ret < 0)
1004                 goto out_put_group;
1005         if (ret > 0)
1006                 btrfs_release_path(path);
1007         if (ret == 0) {
1008                 ret = btrfs_del_item(trans, tree_root, path);
1009                 if (ret)
1010                         goto out_put_group;
1011                 btrfs_release_path(path);
1012         }
1013
1014         spin_lock(&fs_info->block_group_cache_lock);
1015         rb_erase(&block_group->cache_node,
1016                  &fs_info->block_group_cache_tree);
1017         RB_CLEAR_NODE(&block_group->cache_node);
1018
1019         if (fs_info->first_logical_byte == block_group->start)
1020                 fs_info->first_logical_byte = (u64)-1;
1021         spin_unlock(&fs_info->block_group_cache_lock);
1022
1023         down_write(&block_group->space_info->groups_sem);
1024         /*
1025          * we must use list_del_init so people can check to see if they
1026          * are still on the list after taking the semaphore
1027          */
1028         list_del_init(&block_group->list);
1029         if (list_empty(&block_group->space_info->block_groups[index])) {
1030                 kobj = block_group->space_info->block_group_kobjs[index];
1031                 block_group->space_info->block_group_kobjs[index] = NULL;
1032                 clear_avail_alloc_bits(fs_info, block_group->flags);
1033         }
1034         up_write(&block_group->space_info->groups_sem);
1035         clear_incompat_bg_bits(fs_info, block_group->flags);
1036         if (kobj) {
1037                 kobject_del(kobj);
1038                 kobject_put(kobj);
1039         }
1040
1041         if (block_group->has_caching_ctl)
1042                 caching_ctl = btrfs_get_caching_control(block_group);
1043         if (block_group->cached == BTRFS_CACHE_STARTED)
1044                 btrfs_wait_block_group_cache_done(block_group);
1045         if (block_group->has_caching_ctl) {
1046                 down_write(&fs_info->commit_root_sem);
1047                 if (!caching_ctl) {
1048                         struct btrfs_caching_control *ctl;
1049
1050                         list_for_each_entry(ctl,
1051                                     &fs_info->caching_block_groups, list)
1052                                 if (ctl->block_group == block_group) {
1053                                         caching_ctl = ctl;
1054                                         refcount_inc(&caching_ctl->count);
1055                                         break;
1056                                 }
1057                 }
1058                 if (caching_ctl)
1059                         list_del_init(&caching_ctl->list);
1060                 up_write(&fs_info->commit_root_sem);
1061                 if (caching_ctl) {
1062                         /* Once for the caching bgs list and once for us. */
1063                         btrfs_put_caching_control(caching_ctl);
1064                         btrfs_put_caching_control(caching_ctl);
1065                 }
1066         }
1067
1068         spin_lock(&trans->transaction->dirty_bgs_lock);
1069         WARN_ON(!list_empty(&block_group->dirty_list));
1070         WARN_ON(!list_empty(&block_group->io_list));
1071         spin_unlock(&trans->transaction->dirty_bgs_lock);
1072
1073         btrfs_remove_free_space_cache(block_group);
1074
1075         spin_lock(&block_group->space_info->lock);
1076         list_del_init(&block_group->ro_list);
1077
1078         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1079                 WARN_ON(block_group->space_info->total_bytes
1080                         < block_group->length);
1081                 WARN_ON(block_group->space_info->bytes_readonly
1082                         < block_group->length);
1083                 WARN_ON(block_group->space_info->disk_total
1084                         < block_group->length * factor);
1085         }
1086         block_group->space_info->total_bytes -= block_group->length;
1087         block_group->space_info->bytes_readonly -= block_group->length;
1088         block_group->space_info->disk_total -= block_group->length * factor;
1089
1090         spin_unlock(&block_group->space_info->lock);
1091
1092         mutex_lock(&fs_info->chunk_mutex);
1093         spin_lock(&block_group->lock);
1094         block_group->removed = 1;
1095         /*
1096          * At this point trimming or scrub can't start on this block group,
1097          * because we removed the block group from the rbtree
1098          * fs_info->block_group_cache_tree so no one can't find it anymore and
1099          * even if someone already got this block group before we removed it
1100          * from the rbtree, they have already incremented block_group->frozen -
1101          * if they didn't, for the trimming case they won't find any free space
1102          * entries because we already removed them all when we called
1103          * btrfs_remove_free_space_cache().
1104          *
1105          * And we must not remove the extent map from the fs_info->mapping_tree
1106          * to prevent the same logical address range and physical device space
1107          * ranges from being reused for a new block group. This is needed to
1108          * avoid races with trimming and scrub.
1109          *
1110          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1111          * completely transactionless, so while it is trimming a range the
1112          * currently running transaction might finish and a new one start,
1113          * allowing for new block groups to be created that can reuse the same
1114          * physical device locations unless we take this special care.
1115          *
1116          * There may also be an implicit trim operation if the file system
1117          * is mounted with -odiscard. The same protections must remain
1118          * in place until the extents have been discarded completely when
1119          * the transaction commit has completed.
1120          */
1121         remove_em = (atomic_read(&block_group->frozen) == 0);
1122         spin_unlock(&block_group->lock);
1123
1124         mutex_unlock(&fs_info->chunk_mutex);
1125
1126         ret = remove_block_group_free_space(trans, block_group);
1127         if (ret)
1128                 goto out_put_group;
1129
1130         /* Once for the block groups rbtree */
1131         btrfs_put_block_group(block_group);
1132
1133         ret = remove_block_group_item(trans, path, block_group);
1134         if (ret < 0)
1135                 goto out;
1136
1137         if (remove_em) {
1138                 struct extent_map_tree *em_tree;
1139
1140                 em_tree = &fs_info->mapping_tree;
1141                 write_lock(&em_tree->lock);
1142                 remove_extent_mapping(em_tree, em);
1143                 write_unlock(&em_tree->lock);
1144                 /* once for the tree */
1145                 free_extent_map(em);
1146         }
1147
1148 out_put_group:
1149         /* Once for the lookup reference */
1150         btrfs_put_block_group(block_group);
1151 out:
1152         if (remove_rsv)
1153                 btrfs_delayed_refs_rsv_release(fs_info, 1);
1154         btrfs_free_path(path);
1155         return ret;
1156 }
1157
1158 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1159                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1160 {
1161         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1162         struct extent_map *em;
1163         struct map_lookup *map;
1164         unsigned int num_items;
1165
1166         read_lock(&em_tree->lock);
1167         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1168         read_unlock(&em_tree->lock);
1169         ASSERT(em && em->start == chunk_offset);
1170
1171         /*
1172          * We need to reserve 3 + N units from the metadata space info in order
1173          * to remove a block group (done at btrfs_remove_chunk() and at
1174          * btrfs_remove_block_group()), which are used for:
1175          *
1176          * 1 unit for adding the free space inode's orphan (located in the tree
1177          * of tree roots).
1178          * 1 unit for deleting the block group item (located in the extent
1179          * tree).
1180          * 1 unit for deleting the free space item (located in tree of tree
1181          * roots).
1182          * N units for deleting N device extent items corresponding to each
1183          * stripe (located in the device tree).
1184          *
1185          * In order to remove a block group we also need to reserve units in the
1186          * system space info in order to update the chunk tree (update one or
1187          * more device items and remove one chunk item), but this is done at
1188          * btrfs_remove_chunk() through a call to check_system_chunk().
1189          */
1190         map = em->map_lookup;
1191         num_items = 3 + map->num_stripes;
1192         free_extent_map(em);
1193
1194         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1195                                                            num_items);
1196 }
1197
1198 /*
1199  * Mark block group @cache read-only, so later write won't happen to block
1200  * group @cache.
1201  *
1202  * If @force is not set, this function will only mark the block group readonly
1203  * if we have enough free space (1M) in other metadata/system block groups.
1204  * If @force is not set, this function will mark the block group readonly
1205  * without checking free space.
1206  *
1207  * NOTE: This function doesn't care if other block groups can contain all the
1208  * data in this block group. That check should be done by relocation routine,
1209  * not this function.
1210  */
1211 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1212 {
1213         struct btrfs_space_info *sinfo = cache->space_info;
1214         u64 num_bytes;
1215         int ret = -ENOSPC;
1216
1217         spin_lock(&sinfo->lock);
1218         spin_lock(&cache->lock);
1219
1220         if (cache->ro) {
1221                 cache->ro++;
1222                 ret = 0;
1223                 goto out;
1224         }
1225
1226         num_bytes = cache->length - cache->reserved - cache->pinned -
1227                     cache->bytes_super - cache->used;
1228
1229         /*
1230          * Data never overcommits, even in mixed mode, so do just the straight
1231          * check of left over space in how much we have allocated.
1232          */
1233         if (force) {
1234                 ret = 0;
1235         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1236                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1237
1238                 /*
1239                  * Here we make sure if we mark this bg RO, we still have enough
1240                  * free space as buffer.
1241                  */
1242                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
1243                         ret = 0;
1244         } else {
1245                 /*
1246                  * We overcommit metadata, so we need to do the
1247                  * btrfs_can_overcommit check here, and we need to pass in
1248                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1249                  * leeway to allow us to mark this block group as read only.
1250                  */
1251                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1252                                          BTRFS_RESERVE_NO_FLUSH))
1253                         ret = 0;
1254         }
1255
1256         if (!ret) {
1257                 sinfo->bytes_readonly += num_bytes;
1258                 cache->ro++;
1259                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1260         }
1261 out:
1262         spin_unlock(&cache->lock);
1263         spin_unlock(&sinfo->lock);
1264         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1265                 btrfs_info(cache->fs_info,
1266                         "unable to make block group %llu ro", cache->start);
1267                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1268         }
1269         return ret;
1270 }
1271
1272 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1273                                  struct btrfs_block_group *bg)
1274 {
1275         struct btrfs_fs_info *fs_info = bg->fs_info;
1276         struct btrfs_transaction *prev_trans = NULL;
1277         const u64 start = bg->start;
1278         const u64 end = start + bg->length - 1;
1279         int ret;
1280
1281         spin_lock(&fs_info->trans_lock);
1282         if (trans->transaction->list.prev != &fs_info->trans_list) {
1283                 prev_trans = list_last_entry(&trans->transaction->list,
1284                                              struct btrfs_transaction, list);
1285                 refcount_inc(&prev_trans->use_count);
1286         }
1287         spin_unlock(&fs_info->trans_lock);
1288
1289         /*
1290          * Hold the unused_bg_unpin_mutex lock to avoid racing with
1291          * btrfs_finish_extent_commit(). If we are at transaction N, another
1292          * task might be running finish_extent_commit() for the previous
1293          * transaction N - 1, and have seen a range belonging to the block
1294          * group in pinned_extents before we were able to clear the whole block
1295          * group range from pinned_extents. This means that task can lookup for
1296          * the block group after we unpinned it from pinned_extents and removed
1297          * it, leading to a BUG_ON() at unpin_extent_range().
1298          */
1299         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1300         if (prev_trans) {
1301                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1302                                         EXTENT_DIRTY);
1303                 if (ret)
1304                         goto out;
1305         }
1306
1307         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1308                                 EXTENT_DIRTY);
1309 out:
1310         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1311         if (prev_trans)
1312                 btrfs_put_transaction(prev_trans);
1313
1314         return ret == 0;
1315 }
1316
1317 /*
1318  * Process the unused_bgs list and remove any that don't have any allocated
1319  * space inside of them.
1320  */
1321 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1322 {
1323         struct btrfs_block_group *block_group;
1324         struct btrfs_space_info *space_info;
1325         struct btrfs_trans_handle *trans;
1326         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1327         int ret = 0;
1328
1329         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1330                 return;
1331
1332         spin_lock(&fs_info->unused_bgs_lock);
1333         while (!list_empty(&fs_info->unused_bgs)) {
1334                 int trimming;
1335
1336                 block_group = list_first_entry(&fs_info->unused_bgs,
1337                                                struct btrfs_block_group,
1338                                                bg_list);
1339                 list_del_init(&block_group->bg_list);
1340
1341                 space_info = block_group->space_info;
1342
1343                 if (ret || btrfs_mixed_space_info(space_info)) {
1344                         btrfs_put_block_group(block_group);
1345                         continue;
1346                 }
1347                 spin_unlock(&fs_info->unused_bgs_lock);
1348
1349                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1350
1351                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
1352
1353                 /* Don't want to race with allocators so take the groups_sem */
1354                 down_write(&space_info->groups_sem);
1355
1356                 /*
1357                  * Async discard moves the final block group discard to be prior
1358                  * to the unused_bgs code path.  Therefore, if it's not fully
1359                  * trimmed, punt it back to the async discard lists.
1360                  */
1361                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1362                     !btrfs_is_free_space_trimmed(block_group)) {
1363                         trace_btrfs_skip_unused_block_group(block_group);
1364                         up_write(&space_info->groups_sem);
1365                         /* Requeue if we failed because of async discard */
1366                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1367                                                  block_group);
1368                         goto next;
1369                 }
1370
1371                 spin_lock(&block_group->lock);
1372                 if (block_group->reserved || block_group->pinned ||
1373                     block_group->used || block_group->ro ||
1374                     list_is_singular(&block_group->list)) {
1375                         /*
1376                          * We want to bail if we made new allocations or have
1377                          * outstanding allocations in this block group.  We do
1378                          * the ro check in case balance is currently acting on
1379                          * this block group.
1380                          */
1381                         trace_btrfs_skip_unused_block_group(block_group);
1382                         spin_unlock(&block_group->lock);
1383                         up_write(&space_info->groups_sem);
1384                         goto next;
1385                 }
1386                 spin_unlock(&block_group->lock);
1387
1388                 /* We don't want to force the issue, only flip if it's ok. */
1389                 ret = inc_block_group_ro(block_group, 0);
1390                 up_write(&space_info->groups_sem);
1391                 if (ret < 0) {
1392                         ret = 0;
1393                         goto next;
1394                 }
1395
1396                 /*
1397                  * Want to do this before we do anything else so we can recover
1398                  * properly if we fail to join the transaction.
1399                  */
1400                 trans = btrfs_start_trans_remove_block_group(fs_info,
1401                                                      block_group->start);
1402                 if (IS_ERR(trans)) {
1403                         btrfs_dec_block_group_ro(block_group);
1404                         ret = PTR_ERR(trans);
1405                         goto next;
1406                 }
1407
1408                 /*
1409                  * We could have pending pinned extents for this block group,
1410                  * just delete them, we don't care about them anymore.
1411                  */
1412                 if (!clean_pinned_extents(trans, block_group)) {
1413                         btrfs_dec_block_group_ro(block_group);
1414                         goto end_trans;
1415                 }
1416
1417                 /*
1418                  * At this point, the block_group is read only and should fail
1419                  * new allocations.  However, btrfs_finish_extent_commit() can
1420                  * cause this block_group to be placed back on the discard
1421                  * lists because now the block_group isn't fully discarded.
1422                  * Bail here and try again later after discarding everything.
1423                  */
1424                 spin_lock(&fs_info->discard_ctl.lock);
1425                 if (!list_empty(&block_group->discard_list)) {
1426                         spin_unlock(&fs_info->discard_ctl.lock);
1427                         btrfs_dec_block_group_ro(block_group);
1428                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1429                                                  block_group);
1430                         goto end_trans;
1431                 }
1432                 spin_unlock(&fs_info->discard_ctl.lock);
1433
1434                 /* Reset pinned so btrfs_put_block_group doesn't complain */
1435                 spin_lock(&space_info->lock);
1436                 spin_lock(&block_group->lock);
1437
1438                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1439                                                      -block_group->pinned);
1440                 space_info->bytes_readonly += block_group->pinned;
1441                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1442                                    -block_group->pinned,
1443                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1444                 block_group->pinned = 0;
1445
1446                 spin_unlock(&block_group->lock);
1447                 spin_unlock(&space_info->lock);
1448
1449                 /*
1450                  * The normal path here is an unused block group is passed here,
1451                  * then trimming is handled in the transaction commit path.
1452                  * Async discard interposes before this to do the trimming
1453                  * before coming down the unused block group path as trimming
1454                  * will no longer be done later in the transaction commit path.
1455                  */
1456                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1457                         goto flip_async;
1458
1459                 /* DISCARD can flip during remount */
1460                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
1461
1462                 /* Implicit trim during transaction commit. */
1463                 if (trimming)
1464                         btrfs_freeze_block_group(block_group);
1465
1466                 /*
1467                  * Btrfs_remove_chunk will abort the transaction if things go
1468                  * horribly wrong.
1469                  */
1470                 ret = btrfs_remove_chunk(trans, block_group->start);
1471
1472                 if (ret) {
1473                         if (trimming)
1474                                 btrfs_unfreeze_block_group(block_group);
1475                         goto end_trans;
1476                 }
1477
1478                 /*
1479                  * If we're not mounted with -odiscard, we can just forget
1480                  * about this block group. Otherwise we'll need to wait
1481                  * until transaction commit to do the actual discard.
1482                  */
1483                 if (trimming) {
1484                         spin_lock(&fs_info->unused_bgs_lock);
1485                         /*
1486                          * A concurrent scrub might have added us to the list
1487                          * fs_info->unused_bgs, so use a list_move operation
1488                          * to add the block group to the deleted_bgs list.
1489                          */
1490                         list_move(&block_group->bg_list,
1491                                   &trans->transaction->deleted_bgs);
1492                         spin_unlock(&fs_info->unused_bgs_lock);
1493                         btrfs_get_block_group(block_group);
1494                 }
1495 end_trans:
1496                 btrfs_end_transaction(trans);
1497 next:
1498                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1499                 btrfs_put_block_group(block_group);
1500                 spin_lock(&fs_info->unused_bgs_lock);
1501         }
1502         spin_unlock(&fs_info->unused_bgs_lock);
1503         return;
1504
1505 flip_async:
1506         btrfs_end_transaction(trans);
1507         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1508         btrfs_put_block_group(block_group);
1509         btrfs_discard_punt_unused_bgs_list(fs_info);
1510 }
1511
1512 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1513 {
1514         struct btrfs_fs_info *fs_info = bg->fs_info;
1515
1516         spin_lock(&fs_info->unused_bgs_lock);
1517         if (list_empty(&bg->bg_list)) {
1518                 btrfs_get_block_group(bg);
1519                 trace_btrfs_add_unused_block_group(bg);
1520                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1521         }
1522         spin_unlock(&fs_info->unused_bgs_lock);
1523 }
1524
1525 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1526                                   struct btrfs_path *path,
1527                                   struct btrfs_key *key)
1528 {
1529         struct btrfs_root *root = fs_info->extent_root;
1530         int ret = 0;
1531         struct btrfs_key found_key;
1532         struct extent_buffer *leaf;
1533         struct btrfs_block_group_item bg;
1534         u64 flags;
1535         int slot;
1536
1537         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1538         if (ret < 0)
1539                 goto out;
1540
1541         while (1) {
1542                 slot = path->slots[0];
1543                 leaf = path->nodes[0];
1544                 if (slot >= btrfs_header_nritems(leaf)) {
1545                         ret = btrfs_next_leaf(root, path);
1546                         if (ret == 0)
1547                                 continue;
1548                         if (ret < 0)
1549                                 goto out;
1550                         break;
1551                 }
1552                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1553
1554                 if (found_key.objectid >= key->objectid &&
1555                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1556                         struct extent_map_tree *em_tree;
1557                         struct extent_map *em;
1558
1559                         em_tree = &root->fs_info->mapping_tree;
1560                         read_lock(&em_tree->lock);
1561                         em = lookup_extent_mapping(em_tree, found_key.objectid,
1562                                                    found_key.offset);
1563                         read_unlock(&em_tree->lock);
1564                         if (!em) {
1565                                 btrfs_err(fs_info,
1566                         "logical %llu len %llu found bg but no related chunk",
1567                                           found_key.objectid, found_key.offset);
1568                                 ret = -ENOENT;
1569                         } else if (em->start != found_key.objectid ||
1570                                    em->len != found_key.offset) {
1571                                 btrfs_err(fs_info,
1572                 "block group %llu len %llu mismatch with chunk %llu len %llu",
1573                                           found_key.objectid, found_key.offset,
1574                                           em->start, em->len);
1575                                 ret = -EUCLEAN;
1576                         } else {
1577                                 read_extent_buffer(leaf, &bg,
1578                                         btrfs_item_ptr_offset(leaf, slot),
1579                                         sizeof(bg));
1580                                 flags = btrfs_stack_block_group_flags(&bg) &
1581                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
1582
1583                                 if (flags != (em->map_lookup->type &
1584                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1585                                         btrfs_err(fs_info,
1586 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1587                                                 found_key.objectid,
1588                                                 found_key.offset, flags,
1589                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
1590                                                  em->map_lookup->type));
1591                                         ret = -EUCLEAN;
1592                                 } else {
1593                                         ret = 0;
1594                                 }
1595                         }
1596                         free_extent_map(em);
1597                         goto out;
1598                 }
1599                 path->slots[0]++;
1600         }
1601 out:
1602         return ret;
1603 }
1604
1605 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1606 {
1607         u64 extra_flags = chunk_to_extended(flags) &
1608                                 BTRFS_EXTENDED_PROFILE_MASK;
1609
1610         write_seqlock(&fs_info->profiles_lock);
1611         if (flags & BTRFS_BLOCK_GROUP_DATA)
1612                 fs_info->avail_data_alloc_bits |= extra_flags;
1613         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1614                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1615         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1616                 fs_info->avail_system_alloc_bits |= extra_flags;
1617         write_sequnlock(&fs_info->profiles_lock);
1618 }
1619
1620 /**
1621  * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1622  * @chunk_start:   logical address of block group
1623  * @physical:      physical address to map to logical addresses
1624  * @logical:       return array of logical addresses which map to @physical
1625  * @naddrs:        length of @logical
1626  * @stripe_len:    size of IO stripe for the given block group
1627  *
1628  * Maps a particular @physical disk address to a list of @logical addresses.
1629  * Used primarily to exclude those portions of a block group that contain super
1630  * block copies.
1631  */
1632 EXPORT_FOR_TESTS
1633 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1634                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1635 {
1636         struct extent_map *em;
1637         struct map_lookup *map;
1638         u64 *buf;
1639         u64 bytenr;
1640         u64 data_stripe_length;
1641         u64 io_stripe_size;
1642         int i, nr = 0;
1643         int ret = 0;
1644
1645         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1646         if (IS_ERR(em))
1647                 return -EIO;
1648
1649         map = em->map_lookup;
1650         data_stripe_length = em->len;
1651         io_stripe_size = map->stripe_len;
1652
1653         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
1654                 data_stripe_length = div_u64(data_stripe_length,
1655                                              map->num_stripes / map->sub_stripes);
1656         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
1657                 data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
1658         else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1659                 data_stripe_length = div_u64(data_stripe_length,
1660                                              nr_data_stripes(map));
1661                 io_stripe_size = map->stripe_len * nr_data_stripes(map);
1662         }
1663
1664         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1665         if (!buf) {
1666                 ret = -ENOMEM;
1667                 goto out;
1668         }
1669
1670         for (i = 0; i < map->num_stripes; i++) {
1671                 bool already_inserted = false;
1672                 u64 stripe_nr;
1673                 int j;
1674
1675                 if (!in_range(physical, map->stripes[i].physical,
1676                               data_stripe_length))
1677                         continue;
1678
1679                 stripe_nr = physical - map->stripes[i].physical;
1680                 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
1681
1682                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1683                         stripe_nr = stripe_nr * map->num_stripes + i;
1684                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1685                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1686                         stripe_nr = stripe_nr * map->num_stripes + i;
1687                 }
1688                 /*
1689                  * The remaining case would be for RAID56, multiply by
1690                  * nr_data_stripes().  Alternatively, just use rmap_len below
1691                  * instead of map->stripe_len
1692                  */
1693
1694                 bytenr = chunk_start + stripe_nr * io_stripe_size;
1695
1696                 /* Ensure we don't add duplicate addresses */
1697                 for (j = 0; j < nr; j++) {
1698                         if (buf[j] == bytenr) {
1699                                 already_inserted = true;
1700                                 break;
1701                         }
1702                 }
1703
1704                 if (!already_inserted)
1705                         buf[nr++] = bytenr;
1706         }
1707
1708         *logical = buf;
1709         *naddrs = nr;
1710         *stripe_len = io_stripe_size;
1711 out:
1712         free_extent_map(em);
1713         return ret;
1714 }
1715
1716 static int exclude_super_stripes(struct btrfs_block_group *cache)
1717 {
1718         struct btrfs_fs_info *fs_info = cache->fs_info;
1719         u64 bytenr;
1720         u64 *logical;
1721         int stripe_len;
1722         int i, nr, ret;
1723
1724         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1725                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1726                 cache->bytes_super += stripe_len;
1727                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
1728                                                 stripe_len);
1729                 if (ret)
1730                         return ret;
1731         }
1732
1733         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1734                 bytenr = btrfs_sb_offset(i);
1735                 ret = btrfs_rmap_block(fs_info, cache->start,
1736                                        bytenr, &logical, &nr, &stripe_len);
1737                 if (ret)
1738                         return ret;
1739
1740                 while (nr--) {
1741                         u64 start, len;
1742
1743                         if (logical[nr] > cache->start + cache->length)
1744                                 continue;
1745
1746                         if (logical[nr] + stripe_len <= cache->start)
1747                                 continue;
1748
1749                         start = logical[nr];
1750                         if (start < cache->start) {
1751                                 start = cache->start;
1752                                 len = (logical[nr] + stripe_len) - start;
1753                         } else {
1754                                 len = min_t(u64, stripe_len,
1755                                             cache->start + cache->length - start);
1756                         }
1757
1758                         cache->bytes_super += len;
1759                         ret = btrfs_add_excluded_extent(fs_info, start, len);
1760                         if (ret) {
1761                                 kfree(logical);
1762                                 return ret;
1763                         }
1764                 }
1765
1766                 kfree(logical);
1767         }
1768         return 0;
1769 }
1770
1771 static void link_block_group(struct btrfs_block_group *cache)
1772 {
1773         struct btrfs_space_info *space_info = cache->space_info;
1774         int index = btrfs_bg_flags_to_raid_index(cache->flags);
1775         bool first = false;
1776
1777         down_write(&space_info->groups_sem);
1778         if (list_empty(&space_info->block_groups[index]))
1779                 first = true;
1780         list_add_tail(&cache->list, &space_info->block_groups[index]);
1781         up_write(&space_info->groups_sem);
1782
1783         if (first)
1784                 btrfs_sysfs_add_block_group_type(cache);
1785 }
1786
1787 static struct btrfs_block_group *btrfs_create_block_group_cache(
1788                 struct btrfs_fs_info *fs_info, u64 start)
1789 {
1790         struct btrfs_block_group *cache;
1791
1792         cache = kzalloc(sizeof(*cache), GFP_NOFS);
1793         if (!cache)
1794                 return NULL;
1795
1796         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1797                                         GFP_NOFS);
1798         if (!cache->free_space_ctl) {
1799                 kfree(cache);
1800                 return NULL;
1801         }
1802
1803         cache->start = start;
1804
1805         cache->fs_info = fs_info;
1806         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1807         set_free_space_tree_thresholds(cache);
1808
1809         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1810
1811         atomic_set(&cache->count, 1);
1812         spin_lock_init(&cache->lock);
1813         init_rwsem(&cache->data_rwsem);
1814         INIT_LIST_HEAD(&cache->list);
1815         INIT_LIST_HEAD(&cache->cluster_list);
1816         INIT_LIST_HEAD(&cache->bg_list);
1817         INIT_LIST_HEAD(&cache->ro_list);
1818         INIT_LIST_HEAD(&cache->discard_list);
1819         INIT_LIST_HEAD(&cache->dirty_list);
1820         INIT_LIST_HEAD(&cache->io_list);
1821         btrfs_init_free_space_ctl(cache);
1822         atomic_set(&cache->frozen, 0);
1823         mutex_init(&cache->free_space_lock);
1824         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1825
1826         return cache;
1827 }
1828
1829 /*
1830  * Iterate all chunks and verify that each of them has the corresponding block
1831  * group
1832  */
1833 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1834 {
1835         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1836         struct extent_map *em;
1837         struct btrfs_block_group *bg;
1838         u64 start = 0;
1839         int ret = 0;
1840
1841         while (1) {
1842                 read_lock(&map_tree->lock);
1843                 /*
1844                  * lookup_extent_mapping will return the first extent map
1845                  * intersecting the range, so setting @len to 1 is enough to
1846                  * get the first chunk.
1847                  */
1848                 em = lookup_extent_mapping(map_tree, start, 1);
1849                 read_unlock(&map_tree->lock);
1850                 if (!em)
1851                         break;
1852
1853                 bg = btrfs_lookup_block_group(fs_info, em->start);
1854                 if (!bg) {
1855                         btrfs_err(fs_info,
1856         "chunk start=%llu len=%llu doesn't have corresponding block group",
1857                                      em->start, em->len);
1858                         ret = -EUCLEAN;
1859                         free_extent_map(em);
1860                         break;
1861                 }
1862                 if (bg->start != em->start || bg->length != em->len ||
1863                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1864                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1865                         btrfs_err(fs_info,
1866 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1867                                 em->start, em->len,
1868                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1869                                 bg->start, bg->length,
1870                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1871                         ret = -EUCLEAN;
1872                         free_extent_map(em);
1873                         btrfs_put_block_group(bg);
1874                         break;
1875                 }
1876                 start = em->start + em->len;
1877                 free_extent_map(em);
1878                 btrfs_put_block_group(bg);
1879         }
1880         return ret;
1881 }
1882
1883 static int read_block_group_item(struct btrfs_block_group *cache,
1884                                  struct btrfs_path *path,
1885                                  const struct btrfs_key *key)
1886 {
1887         struct extent_buffer *leaf = path->nodes[0];
1888         struct btrfs_block_group_item bgi;
1889         int slot = path->slots[0];
1890
1891         cache->length = key->offset;
1892
1893         read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
1894                            sizeof(bgi));
1895         cache->used = btrfs_stack_block_group_used(&bgi);
1896         cache->flags = btrfs_stack_block_group_flags(&bgi);
1897
1898         return 0;
1899 }
1900
1901 static int read_one_block_group(struct btrfs_fs_info *info,
1902                                 struct btrfs_path *path,
1903                                 const struct btrfs_key *key,
1904                                 int need_clear)
1905 {
1906         struct btrfs_block_group *cache;
1907         struct btrfs_space_info *space_info;
1908         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
1909         int ret;
1910
1911         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
1912
1913         cache = btrfs_create_block_group_cache(info, key->objectid);
1914         if (!cache)
1915                 return -ENOMEM;
1916
1917         ret = read_block_group_item(cache, path, key);
1918         if (ret < 0)
1919                 goto error;
1920
1921         if (need_clear) {
1922                 /*
1923                  * When we mount with old space cache, we need to
1924                  * set BTRFS_DC_CLEAR and set dirty flag.
1925                  *
1926                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1927                  *    truncate the old free space cache inode and
1928                  *    setup a new one.
1929                  * b) Setting 'dirty flag' makes sure that we flush
1930                  *    the new space cache info onto disk.
1931                  */
1932                 if (btrfs_test_opt(info, SPACE_CACHE))
1933                         cache->disk_cache_state = BTRFS_DC_CLEAR;
1934         }
1935         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1936             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1937                         btrfs_err(info,
1938 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1939                                   cache->start);
1940                         ret = -EINVAL;
1941                         goto error;
1942         }
1943
1944         /*
1945          * We need to exclude the super stripes now so that the space info has
1946          * super bytes accounted for, otherwise we'll think we have more space
1947          * than we actually do.
1948          */
1949         ret = exclude_super_stripes(cache);
1950         if (ret) {
1951                 /* We may have excluded something, so call this just in case. */
1952                 btrfs_free_excluded_extents(cache);
1953                 goto error;
1954         }
1955
1956         /*
1957          * Check for two cases, either we are full, and therefore don't need
1958          * to bother with the caching work since we won't find any space, or we
1959          * are empty, and we can just add all the space in and be done with it.
1960          * This saves us _a_lot_ of time, particularly in the full case.
1961          */
1962         if (cache->length == cache->used) {
1963                 cache->last_byte_to_unpin = (u64)-1;
1964                 cache->cached = BTRFS_CACHE_FINISHED;
1965                 btrfs_free_excluded_extents(cache);
1966         } else if (cache->used == 0) {
1967                 cache->last_byte_to_unpin = (u64)-1;
1968                 cache->cached = BTRFS_CACHE_FINISHED;
1969                 add_new_free_space(cache, cache->start,
1970                                    cache->start + cache->length);
1971                 btrfs_free_excluded_extents(cache);
1972         }
1973
1974         ret = btrfs_add_block_group_cache(info, cache);
1975         if (ret) {
1976                 btrfs_remove_free_space_cache(cache);
1977                 goto error;
1978         }
1979         trace_btrfs_add_block_group(info, cache, 0);
1980         btrfs_update_space_info(info, cache->flags, cache->length,
1981                                 cache->used, cache->bytes_super, &space_info);
1982
1983         cache->space_info = space_info;
1984
1985         link_block_group(cache);
1986
1987         set_avail_alloc_bits(info, cache->flags);
1988         if (btrfs_chunk_readonly(info, cache->start)) {
1989                 inc_block_group_ro(cache, 1);
1990         } else if (cache->used == 0) {
1991                 ASSERT(list_empty(&cache->bg_list));
1992                 if (btrfs_test_opt(info, DISCARD_ASYNC))
1993                         btrfs_discard_queue_work(&info->discard_ctl, cache);
1994                 else
1995                         btrfs_mark_bg_unused(cache);
1996         }
1997         return 0;
1998 error:
1999         btrfs_put_block_group(cache);
2000         return ret;
2001 }
2002
2003 int btrfs_read_block_groups(struct btrfs_fs_info *info)
2004 {
2005         struct btrfs_path *path;
2006         int ret;
2007         struct btrfs_block_group *cache;
2008         struct btrfs_space_info *space_info;
2009         struct btrfs_key key;
2010         int need_clear = 0;
2011         u64 cache_gen;
2012
2013         key.objectid = 0;
2014         key.offset = 0;
2015         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2016         path = btrfs_alloc_path();
2017         if (!path)
2018                 return -ENOMEM;
2019
2020         cache_gen = btrfs_super_cache_generation(info->super_copy);
2021         if (btrfs_test_opt(info, SPACE_CACHE) &&
2022             btrfs_super_generation(info->super_copy) != cache_gen)
2023                 need_clear = 1;
2024         if (btrfs_test_opt(info, CLEAR_CACHE))
2025                 need_clear = 1;
2026
2027         while (1) {
2028                 ret = find_first_block_group(info, path, &key);
2029                 if (ret > 0)
2030                         break;
2031                 if (ret != 0)
2032                         goto error;
2033
2034                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2035                 ret = read_one_block_group(info, path, &key, need_clear);
2036                 if (ret < 0)
2037                         goto error;
2038                 key.objectid += key.offset;
2039                 key.offset = 0;
2040                 btrfs_release_path(path);
2041         }
2042
2043         rcu_read_lock();
2044         list_for_each_entry_rcu(space_info, &info->space_info, list) {
2045                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2046                       (BTRFS_BLOCK_GROUP_RAID10 |
2047                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2048                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2049                        BTRFS_BLOCK_GROUP_DUP)))
2050                         continue;
2051                 /*
2052                  * Avoid allocating from un-mirrored block group if there are
2053                  * mirrored block groups.
2054                  */
2055                 list_for_each_entry(cache,
2056                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2057                                 list)
2058                         inc_block_group_ro(cache, 1);
2059                 list_for_each_entry(cache,
2060                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2061                                 list)
2062                         inc_block_group_ro(cache, 1);
2063         }
2064         rcu_read_unlock();
2065
2066         btrfs_init_global_block_rsv(info);
2067         ret = check_chunk_block_group_mappings(info);
2068 error:
2069         btrfs_free_path(path);
2070         return ret;
2071 }
2072
2073 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2074                                    struct btrfs_block_group *block_group)
2075 {
2076         struct btrfs_fs_info *fs_info = trans->fs_info;
2077         struct btrfs_block_group_item bgi;
2078         struct btrfs_root *root;
2079         struct btrfs_key key;
2080
2081         spin_lock(&block_group->lock);
2082         btrfs_set_stack_block_group_used(&bgi, block_group->used);
2083         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2084                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2085         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2086         key.objectid = block_group->start;
2087         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2088         key.offset = block_group->length;
2089         spin_unlock(&block_group->lock);
2090
2091         root = fs_info->extent_root;
2092         return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2093 }
2094
2095 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2096 {
2097         struct btrfs_fs_info *fs_info = trans->fs_info;
2098         struct btrfs_block_group *block_group;
2099         int ret = 0;
2100
2101         if (!trans->can_flush_pending_bgs)
2102                 return;
2103
2104         while (!list_empty(&trans->new_bgs)) {
2105                 block_group = list_first_entry(&trans->new_bgs,
2106                                                struct btrfs_block_group,
2107                                                bg_list);
2108                 if (ret)
2109                         goto next;
2110
2111                 ret = insert_block_group_item(trans, block_group);
2112                 if (ret)
2113                         btrfs_abort_transaction(trans, ret);
2114                 ret = btrfs_finish_chunk_alloc(trans, block_group->start,
2115                                         block_group->length);
2116                 if (ret)
2117                         btrfs_abort_transaction(trans, ret);
2118                 add_block_group_free_space(trans, block_group);
2119                 /* Already aborted the transaction if it failed. */
2120 next:
2121                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2122                 list_del_init(&block_group->bg_list);
2123         }
2124         btrfs_trans_release_chunk_metadata(trans);
2125 }
2126
2127 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
2128                            u64 type, u64 chunk_offset, u64 size)
2129 {
2130         struct btrfs_fs_info *fs_info = trans->fs_info;
2131         struct btrfs_block_group *cache;
2132         int ret;
2133
2134         btrfs_set_log_full_commit(trans);
2135
2136         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2137         if (!cache)
2138                 return -ENOMEM;
2139
2140         cache->length = size;
2141         cache->used = bytes_used;
2142         cache->flags = type;
2143         cache->last_byte_to_unpin = (u64)-1;
2144         cache->cached = BTRFS_CACHE_FINISHED;
2145         cache->needs_free_space = 1;
2146         ret = exclude_super_stripes(cache);
2147         if (ret) {
2148                 /* We may have excluded something, so call this just in case */
2149                 btrfs_free_excluded_extents(cache);
2150                 btrfs_put_block_group(cache);
2151                 return ret;
2152         }
2153
2154         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2155
2156         btrfs_free_excluded_extents(cache);
2157
2158 #ifdef CONFIG_BTRFS_DEBUG
2159         if (btrfs_should_fragment_free_space(cache)) {
2160                 u64 new_bytes_used = size - bytes_used;
2161
2162                 bytes_used += new_bytes_used >> 1;
2163                 fragment_free_space(cache);
2164         }
2165 #endif
2166         /*
2167          * Ensure the corresponding space_info object is created and
2168          * assigned to our block group. We want our bg to be added to the rbtree
2169          * with its ->space_info set.
2170          */
2171         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2172         ASSERT(cache->space_info);
2173
2174         ret = btrfs_add_block_group_cache(fs_info, cache);
2175         if (ret) {
2176                 btrfs_remove_free_space_cache(cache);
2177                 btrfs_put_block_group(cache);
2178                 return ret;
2179         }
2180
2181         /*
2182          * Now that our block group has its ->space_info set and is inserted in
2183          * the rbtree, update the space info's counters.
2184          */
2185         trace_btrfs_add_block_group(fs_info, cache, 1);
2186         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2187                                 cache->bytes_super, &cache->space_info);
2188         btrfs_update_global_block_rsv(fs_info);
2189
2190         link_block_group(cache);
2191
2192         list_add_tail(&cache->bg_list, &trans->new_bgs);
2193         trans->delayed_ref_updates++;
2194         btrfs_update_delayed_refs_rsv(trans);
2195
2196         set_avail_alloc_bits(fs_info, type);
2197         return 0;
2198 }
2199
2200 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
2201 {
2202         u64 num_devices;
2203         u64 stripped;
2204
2205         /*
2206          * if restripe for this chunk_type is on pick target profile and
2207          * return, otherwise do the usual balance
2208          */
2209         stripped = get_restripe_target(fs_info, flags);
2210         if (stripped)
2211                 return extended_to_chunk(stripped);
2212
2213         num_devices = fs_info->fs_devices->rw_devices;
2214
2215         stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
2216                 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
2217
2218         if (num_devices == 1) {
2219                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2220                 stripped = flags & ~stripped;
2221
2222                 /* turn raid0 into single device chunks */
2223                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
2224                         return stripped;
2225
2226                 /* turn mirroring into duplication */
2227                 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
2228                              BTRFS_BLOCK_GROUP_RAID10))
2229                         return stripped | BTRFS_BLOCK_GROUP_DUP;
2230         } else {
2231                 /* they already had raid on here, just return */
2232                 if (flags & stripped)
2233                         return flags;
2234
2235                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2236                 stripped = flags & ~stripped;
2237
2238                 /* switch duplicated blocks with raid1 */
2239                 if (flags & BTRFS_BLOCK_GROUP_DUP)
2240                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
2241
2242                 /* this is drive concat, leave it alone */
2243         }
2244
2245         return flags;
2246 }
2247
2248 /*
2249  * Mark one block group RO, can be called several times for the same block
2250  * group.
2251  *
2252  * @cache:              the destination block group
2253  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2254  *                      ensure we still have some free space after marking this
2255  *                      block group RO.
2256  */
2257 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2258                              bool do_chunk_alloc)
2259 {
2260         struct btrfs_fs_info *fs_info = cache->fs_info;
2261         struct btrfs_trans_handle *trans;
2262         u64 alloc_flags;
2263         int ret;
2264
2265 again:
2266         trans = btrfs_join_transaction(fs_info->extent_root);
2267         if (IS_ERR(trans))
2268                 return PTR_ERR(trans);
2269
2270         /*
2271          * we're not allowed to set block groups readonly after the dirty
2272          * block groups cache has started writing.  If it already started,
2273          * back off and let this transaction commit
2274          */
2275         mutex_lock(&fs_info->ro_block_group_mutex);
2276         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2277                 u64 transid = trans->transid;
2278
2279                 mutex_unlock(&fs_info->ro_block_group_mutex);
2280                 btrfs_end_transaction(trans);
2281
2282                 ret = btrfs_wait_for_commit(fs_info, transid);
2283                 if (ret)
2284                         return ret;
2285                 goto again;
2286         }
2287
2288         if (do_chunk_alloc) {
2289                 /*
2290                  * If we are changing raid levels, try to allocate a
2291                  * corresponding block group with the new raid level.
2292                  */
2293                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
2294                 if (alloc_flags != cache->flags) {
2295                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2296                                                 CHUNK_ALLOC_FORCE);
2297                         /*
2298                          * ENOSPC is allowed here, we may have enough space
2299                          * already allocated at the new raid level to carry on
2300                          */
2301                         if (ret == -ENOSPC)
2302                                 ret = 0;
2303                         if (ret < 0)
2304                                 goto out;
2305                 }
2306         }
2307
2308         ret = inc_block_group_ro(cache, 0);
2309         if (!do_chunk_alloc)
2310                 goto unlock_out;
2311         if (!ret)
2312                 goto out;
2313         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2314         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2315         if (ret < 0)
2316                 goto out;
2317         ret = inc_block_group_ro(cache, 0);
2318 out:
2319         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2320                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
2321                 mutex_lock(&fs_info->chunk_mutex);
2322                 check_system_chunk(trans, alloc_flags);
2323                 mutex_unlock(&fs_info->chunk_mutex);
2324         }
2325 unlock_out:
2326         mutex_unlock(&fs_info->ro_block_group_mutex);
2327
2328         btrfs_end_transaction(trans);
2329         return ret;
2330 }
2331
2332 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2333 {
2334         struct btrfs_space_info *sinfo = cache->space_info;
2335         u64 num_bytes;
2336
2337         BUG_ON(!cache->ro);
2338
2339         spin_lock(&sinfo->lock);
2340         spin_lock(&cache->lock);
2341         if (!--cache->ro) {
2342                 num_bytes = cache->length - cache->reserved -
2343                             cache->pinned - cache->bytes_super - cache->used;
2344                 sinfo->bytes_readonly -= num_bytes;
2345                 list_del_init(&cache->ro_list);
2346         }
2347         spin_unlock(&cache->lock);
2348         spin_unlock(&sinfo->lock);
2349 }
2350
2351 static int update_block_group_item(struct btrfs_trans_handle *trans,
2352                                    struct btrfs_path *path,
2353                                    struct btrfs_block_group *cache)
2354 {
2355         struct btrfs_fs_info *fs_info = trans->fs_info;
2356         int ret;
2357         struct btrfs_root *root = fs_info->extent_root;
2358         unsigned long bi;
2359         struct extent_buffer *leaf;
2360         struct btrfs_block_group_item bgi;
2361         struct btrfs_key key;
2362
2363         key.objectid = cache->start;
2364         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2365         key.offset = cache->length;
2366
2367         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2368         if (ret) {
2369                 if (ret > 0)
2370                         ret = -ENOENT;
2371                 goto fail;
2372         }
2373
2374         leaf = path->nodes[0];
2375         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2376         btrfs_set_stack_block_group_used(&bgi, cache->used);
2377         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2378                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2379         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2380         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2381         btrfs_mark_buffer_dirty(leaf);
2382 fail:
2383         btrfs_release_path(path);
2384         return ret;
2385
2386 }
2387
2388 static int cache_save_setup(struct btrfs_block_group *block_group,
2389                             struct btrfs_trans_handle *trans,
2390                             struct btrfs_path *path)
2391 {
2392         struct btrfs_fs_info *fs_info = block_group->fs_info;
2393         struct btrfs_root *root = fs_info->tree_root;
2394         struct inode *inode = NULL;
2395         struct extent_changeset *data_reserved = NULL;
2396         u64 alloc_hint = 0;
2397         int dcs = BTRFS_DC_ERROR;
2398         u64 num_pages = 0;
2399         int retries = 0;
2400         int ret = 0;
2401
2402         /*
2403          * If this block group is smaller than 100 megs don't bother caching the
2404          * block group.
2405          */
2406         if (block_group->length < (100 * SZ_1M)) {
2407                 spin_lock(&block_group->lock);
2408                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2409                 spin_unlock(&block_group->lock);
2410                 return 0;
2411         }
2412
2413         if (TRANS_ABORTED(trans))
2414                 return 0;
2415 again:
2416         inode = lookup_free_space_inode(block_group, path);
2417         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2418                 ret = PTR_ERR(inode);
2419                 btrfs_release_path(path);
2420                 goto out;
2421         }
2422
2423         if (IS_ERR(inode)) {
2424                 BUG_ON(retries);
2425                 retries++;
2426
2427                 if (block_group->ro)
2428                         goto out_free;
2429
2430                 ret = create_free_space_inode(trans, block_group, path);
2431                 if (ret)
2432                         goto out_free;
2433                 goto again;
2434         }
2435
2436         /*
2437          * We want to set the generation to 0, that way if anything goes wrong
2438          * from here on out we know not to trust this cache when we load up next
2439          * time.
2440          */
2441         BTRFS_I(inode)->generation = 0;
2442         ret = btrfs_update_inode(trans, root, inode);
2443         if (ret) {
2444                 /*
2445                  * So theoretically we could recover from this, simply set the
2446                  * super cache generation to 0 so we know to invalidate the
2447                  * cache, but then we'd have to keep track of the block groups
2448                  * that fail this way so we know we _have_ to reset this cache
2449                  * before the next commit or risk reading stale cache.  So to
2450                  * limit our exposure to horrible edge cases lets just abort the
2451                  * transaction, this only happens in really bad situations
2452                  * anyway.
2453                  */
2454                 btrfs_abort_transaction(trans, ret);
2455                 goto out_put;
2456         }
2457         WARN_ON(ret);
2458
2459         /* We've already setup this transaction, go ahead and exit */
2460         if (block_group->cache_generation == trans->transid &&
2461             i_size_read(inode)) {
2462                 dcs = BTRFS_DC_SETUP;
2463                 goto out_put;
2464         }
2465
2466         if (i_size_read(inode) > 0) {
2467                 ret = btrfs_check_trunc_cache_free_space(fs_info,
2468                                         &fs_info->global_block_rsv);
2469                 if (ret)
2470                         goto out_put;
2471
2472                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2473                 if (ret)
2474                         goto out_put;
2475         }
2476
2477         spin_lock(&block_group->lock);
2478         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2479             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2480                 /*
2481                  * don't bother trying to write stuff out _if_
2482                  * a) we're not cached,
2483                  * b) we're with nospace_cache mount option,
2484                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
2485                  */
2486                 dcs = BTRFS_DC_WRITTEN;
2487                 spin_unlock(&block_group->lock);
2488                 goto out_put;
2489         }
2490         spin_unlock(&block_group->lock);
2491
2492         /*
2493          * We hit an ENOSPC when setting up the cache in this transaction, just
2494          * skip doing the setup, we've already cleared the cache so we're safe.
2495          */
2496         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2497                 ret = -ENOSPC;
2498                 goto out_put;
2499         }
2500
2501         /*
2502          * Try to preallocate enough space based on how big the block group is.
2503          * Keep in mind this has to include any pinned space which could end up
2504          * taking up quite a bit since it's not folded into the other space
2505          * cache.
2506          */
2507         num_pages = div_u64(block_group->length, SZ_256M);
2508         if (!num_pages)
2509                 num_pages = 1;
2510
2511         num_pages *= 16;
2512         num_pages *= PAGE_SIZE;
2513
2514         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
2515         if (ret)
2516                 goto out_put;
2517
2518         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2519                                               num_pages, num_pages,
2520                                               &alloc_hint);
2521         /*
2522          * Our cache requires contiguous chunks so that we don't modify a bunch
2523          * of metadata or split extents when writing the cache out, which means
2524          * we can enospc if we are heavily fragmented in addition to just normal
2525          * out of space conditions.  So if we hit this just skip setting up any
2526          * other block groups for this transaction, maybe we'll unpin enough
2527          * space the next time around.
2528          */
2529         if (!ret)
2530                 dcs = BTRFS_DC_SETUP;
2531         else if (ret == -ENOSPC)
2532                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2533
2534 out_put:
2535         iput(inode);
2536 out_free:
2537         btrfs_release_path(path);
2538 out:
2539         spin_lock(&block_group->lock);
2540         if (!ret && dcs == BTRFS_DC_SETUP)
2541                 block_group->cache_generation = trans->transid;
2542         block_group->disk_cache_state = dcs;
2543         spin_unlock(&block_group->lock);
2544
2545         extent_changeset_free(data_reserved);
2546         return ret;
2547 }
2548
2549 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2550 {
2551         struct btrfs_fs_info *fs_info = trans->fs_info;
2552         struct btrfs_block_group *cache, *tmp;
2553         struct btrfs_transaction *cur_trans = trans->transaction;
2554         struct btrfs_path *path;
2555
2556         if (list_empty(&cur_trans->dirty_bgs) ||
2557             !btrfs_test_opt(fs_info, SPACE_CACHE))
2558                 return 0;
2559
2560         path = btrfs_alloc_path();
2561         if (!path)
2562                 return -ENOMEM;
2563
2564         /* Could add new block groups, use _safe just in case */
2565         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2566                                  dirty_list) {
2567                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2568                         cache_save_setup(cache, trans, path);
2569         }
2570
2571         btrfs_free_path(path);
2572         return 0;
2573 }
2574
2575 /*
2576  * Transaction commit does final block group cache writeback during a critical
2577  * section where nothing is allowed to change the FS.  This is required in
2578  * order for the cache to actually match the block group, but can introduce a
2579  * lot of latency into the commit.
2580  *
2581  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2582  * There's a chance we'll have to redo some of it if the block group changes
2583  * again during the commit, but it greatly reduces the commit latency by
2584  * getting rid of the easy block groups while we're still allowing others to
2585  * join the commit.
2586  */
2587 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2588 {
2589         struct btrfs_fs_info *fs_info = trans->fs_info;
2590         struct btrfs_block_group *cache;
2591         struct btrfs_transaction *cur_trans = trans->transaction;
2592         int ret = 0;
2593         int should_put;
2594         struct btrfs_path *path = NULL;
2595         LIST_HEAD(dirty);
2596         struct list_head *io = &cur_trans->io_bgs;
2597         int num_started = 0;
2598         int loops = 0;
2599
2600         spin_lock(&cur_trans->dirty_bgs_lock);
2601         if (list_empty(&cur_trans->dirty_bgs)) {
2602                 spin_unlock(&cur_trans->dirty_bgs_lock);
2603                 return 0;
2604         }
2605         list_splice_init(&cur_trans->dirty_bgs, &dirty);
2606         spin_unlock(&cur_trans->dirty_bgs_lock);
2607
2608 again:
2609         /* Make sure all the block groups on our dirty list actually exist */
2610         btrfs_create_pending_block_groups(trans);
2611
2612         if (!path) {
2613                 path = btrfs_alloc_path();
2614                 if (!path)
2615                         return -ENOMEM;
2616         }
2617
2618         /*
2619          * cache_write_mutex is here only to save us from balance or automatic
2620          * removal of empty block groups deleting this block group while we are
2621          * writing out the cache
2622          */
2623         mutex_lock(&trans->transaction->cache_write_mutex);
2624         while (!list_empty(&dirty)) {
2625                 bool drop_reserve = true;
2626
2627                 cache = list_first_entry(&dirty, struct btrfs_block_group,
2628                                          dirty_list);
2629                 /*
2630                  * This can happen if something re-dirties a block group that
2631                  * is already under IO.  Just wait for it to finish and then do
2632                  * it all again
2633                  */
2634                 if (!list_empty(&cache->io_list)) {
2635                         list_del_init(&cache->io_list);
2636                         btrfs_wait_cache_io(trans, cache, path);
2637                         btrfs_put_block_group(cache);
2638                 }
2639
2640
2641                 /*
2642                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2643                  * it should update the cache_state.  Don't delete until after
2644                  * we wait.
2645                  *
2646                  * Since we're not running in the commit critical section
2647                  * we need the dirty_bgs_lock to protect from update_block_group
2648                  */
2649                 spin_lock(&cur_trans->dirty_bgs_lock);
2650                 list_del_init(&cache->dirty_list);
2651                 spin_unlock(&cur_trans->dirty_bgs_lock);
2652
2653                 should_put = 1;
2654
2655                 cache_save_setup(cache, trans, path);
2656
2657                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2658                         cache->io_ctl.inode = NULL;
2659                         ret = btrfs_write_out_cache(trans, cache, path);
2660                         if (ret == 0 && cache->io_ctl.inode) {
2661                                 num_started++;
2662                                 should_put = 0;
2663
2664                                 /*
2665                                  * The cache_write_mutex is protecting the
2666                                  * io_list, also refer to the definition of
2667                                  * btrfs_transaction::io_bgs for more details
2668                                  */
2669                                 list_add_tail(&cache->io_list, io);
2670                         } else {
2671                                 /*
2672                                  * If we failed to write the cache, the
2673                                  * generation will be bad and life goes on
2674                                  */
2675                                 ret = 0;
2676                         }
2677                 }
2678                 if (!ret) {
2679                         ret = update_block_group_item(trans, path, cache);
2680                         /*
2681                          * Our block group might still be attached to the list
2682                          * of new block groups in the transaction handle of some
2683                          * other task (struct btrfs_trans_handle->new_bgs). This
2684                          * means its block group item isn't yet in the extent
2685                          * tree. If this happens ignore the error, as we will
2686                          * try again later in the critical section of the
2687                          * transaction commit.
2688                          */
2689                         if (ret == -ENOENT) {
2690                                 ret = 0;
2691                                 spin_lock(&cur_trans->dirty_bgs_lock);
2692                                 if (list_empty(&cache->dirty_list)) {
2693                                         list_add_tail(&cache->dirty_list,
2694                                                       &cur_trans->dirty_bgs);
2695                                         btrfs_get_block_group(cache);
2696                                         drop_reserve = false;
2697                                 }
2698                                 spin_unlock(&cur_trans->dirty_bgs_lock);
2699                         } else if (ret) {
2700                                 btrfs_abort_transaction(trans, ret);
2701                         }
2702                 }
2703
2704                 /* If it's not on the io list, we need to put the block group */
2705                 if (should_put)
2706                         btrfs_put_block_group(cache);
2707                 if (drop_reserve)
2708                         btrfs_delayed_refs_rsv_release(fs_info, 1);
2709
2710                 if (ret)
2711                         break;
2712
2713                 /*
2714                  * Avoid blocking other tasks for too long. It might even save
2715                  * us from writing caches for block groups that are going to be
2716                  * removed.
2717                  */
2718                 mutex_unlock(&trans->transaction->cache_write_mutex);
2719                 mutex_lock(&trans->transaction->cache_write_mutex);
2720         }
2721         mutex_unlock(&trans->transaction->cache_write_mutex);
2722
2723         /*
2724          * Go through delayed refs for all the stuff we've just kicked off
2725          * and then loop back (just once)
2726          */
2727         ret = btrfs_run_delayed_refs(trans, 0);
2728         if (!ret && loops == 0) {
2729                 loops++;
2730                 spin_lock(&cur_trans->dirty_bgs_lock);
2731                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
2732                 /*
2733                  * dirty_bgs_lock protects us from concurrent block group
2734                  * deletes too (not just cache_write_mutex).
2735                  */
2736                 if (!list_empty(&dirty)) {
2737                         spin_unlock(&cur_trans->dirty_bgs_lock);
2738                         goto again;
2739                 }
2740                 spin_unlock(&cur_trans->dirty_bgs_lock);
2741         } else if (ret < 0) {
2742                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2743         }
2744
2745         btrfs_free_path(path);
2746         return ret;
2747 }
2748
2749 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2750 {
2751         struct btrfs_fs_info *fs_info = trans->fs_info;
2752         struct btrfs_block_group *cache;
2753         struct btrfs_transaction *cur_trans = trans->transaction;
2754         int ret = 0;
2755         int should_put;
2756         struct btrfs_path *path;
2757         struct list_head *io = &cur_trans->io_bgs;
2758         int num_started = 0;
2759
2760         path = btrfs_alloc_path();
2761         if (!path)
2762                 return -ENOMEM;
2763
2764         /*
2765          * Even though we are in the critical section of the transaction commit,
2766          * we can still have concurrent tasks adding elements to this
2767          * transaction's list of dirty block groups. These tasks correspond to
2768          * endio free space workers started when writeback finishes for a
2769          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2770          * allocate new block groups as a result of COWing nodes of the root
2771          * tree when updating the free space inode. The writeback for the space
2772          * caches is triggered by an earlier call to
2773          * btrfs_start_dirty_block_groups() and iterations of the following
2774          * loop.
2775          * Also we want to do the cache_save_setup first and then run the
2776          * delayed refs to make sure we have the best chance at doing this all
2777          * in one shot.
2778          */
2779         spin_lock(&cur_trans->dirty_bgs_lock);
2780         while (!list_empty(&cur_trans->dirty_bgs)) {
2781                 cache = list_first_entry(&cur_trans->dirty_bgs,
2782                                          struct btrfs_block_group,
2783                                          dirty_list);
2784
2785                 /*
2786                  * This can happen if cache_save_setup re-dirties a block group
2787                  * that is already under IO.  Just wait for it to finish and
2788                  * then do it all again
2789                  */
2790                 if (!list_empty(&cache->io_list)) {
2791                         spin_unlock(&cur_trans->dirty_bgs_lock);
2792                         list_del_init(&cache->io_list);
2793                         btrfs_wait_cache_io(trans, cache, path);
2794                         btrfs_put_block_group(cache);
2795                         spin_lock(&cur_trans->dirty_bgs_lock);
2796                 }
2797
2798                 /*
2799                  * Don't remove from the dirty list until after we've waited on
2800                  * any pending IO
2801                  */
2802                 list_del_init(&cache->dirty_list);
2803                 spin_unlock(&cur_trans->dirty_bgs_lock);
2804                 should_put = 1;
2805
2806                 cache_save_setup(cache, trans, path);
2807
2808                 if (!ret)
2809                         ret = btrfs_run_delayed_refs(trans,
2810                                                      (unsigned long) -1);
2811
2812                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2813                         cache->io_ctl.inode = NULL;
2814                         ret = btrfs_write_out_cache(trans, cache, path);
2815                         if (ret == 0 && cache->io_ctl.inode) {
2816                                 num_started++;
2817                                 should_put = 0;
2818                                 list_add_tail(&cache->io_list, io);
2819                         } else {
2820                                 /*
2821                                  * If we failed to write the cache, the
2822                                  * generation will be bad and life goes on
2823                                  */
2824                                 ret = 0;
2825                         }
2826                 }
2827                 if (!ret) {
2828                         ret = update_block_group_item(trans, path, cache);
2829                         /*
2830                          * One of the free space endio workers might have
2831                          * created a new block group while updating a free space
2832                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
2833                          * and hasn't released its transaction handle yet, in
2834                          * which case the new block group is still attached to
2835                          * its transaction handle and its creation has not
2836                          * finished yet (no block group item in the extent tree
2837                          * yet, etc). If this is the case, wait for all free
2838                          * space endio workers to finish and retry. This is a
2839                          * a very rare case so no need for a more efficient and
2840                          * complex approach.
2841                          */
2842                         if (ret == -ENOENT) {
2843                                 wait_event(cur_trans->writer_wait,
2844                                    atomic_read(&cur_trans->num_writers) == 1);
2845                                 ret = update_block_group_item(trans, path, cache);
2846                         }
2847                         if (ret)
2848                                 btrfs_abort_transaction(trans, ret);
2849                 }
2850
2851                 /* If its not on the io list, we need to put the block group */
2852                 if (should_put)
2853                         btrfs_put_block_group(cache);
2854                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2855                 spin_lock(&cur_trans->dirty_bgs_lock);
2856         }
2857         spin_unlock(&cur_trans->dirty_bgs_lock);
2858
2859         /*
2860          * Refer to the definition of io_bgs member for details why it's safe
2861          * to use it without any locking
2862          */
2863         while (!list_empty(io)) {
2864                 cache = list_first_entry(io, struct btrfs_block_group,
2865                                          io_list);
2866                 list_del_init(&cache->io_list);
2867                 btrfs_wait_cache_io(trans, cache, path);
2868                 btrfs_put_block_group(cache);
2869         }
2870
2871         btrfs_free_path(path);
2872         return ret;
2873 }
2874
2875 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
2876                              u64 bytenr, u64 num_bytes, int alloc)
2877 {
2878         struct btrfs_fs_info *info = trans->fs_info;
2879         struct btrfs_block_group *cache = NULL;
2880         u64 total = num_bytes;
2881         u64 old_val;
2882         u64 byte_in_group;
2883         int factor;
2884         int ret = 0;
2885
2886         /* Block accounting for super block */
2887         spin_lock(&info->delalloc_root_lock);
2888         old_val = btrfs_super_bytes_used(info->super_copy);
2889         if (alloc)
2890                 old_val += num_bytes;
2891         else
2892                 old_val -= num_bytes;
2893         btrfs_set_super_bytes_used(info->super_copy, old_val);
2894         spin_unlock(&info->delalloc_root_lock);
2895
2896         while (total) {
2897                 cache = btrfs_lookup_block_group(info, bytenr);
2898                 if (!cache) {
2899                         ret = -ENOENT;
2900                         break;
2901                 }
2902                 factor = btrfs_bg_type_to_factor(cache->flags);
2903
2904                 /*
2905                  * If this block group has free space cache written out, we
2906                  * need to make sure to load it if we are removing space.  This
2907                  * is because we need the unpinning stage to actually add the
2908                  * space back to the block group, otherwise we will leak space.
2909                  */
2910                 if (!alloc && !btrfs_block_group_done(cache))
2911                         btrfs_cache_block_group(cache, 1);
2912
2913                 byte_in_group = bytenr - cache->start;
2914                 WARN_ON(byte_in_group > cache->length);
2915
2916                 spin_lock(&cache->space_info->lock);
2917                 spin_lock(&cache->lock);
2918
2919                 if (btrfs_test_opt(info, SPACE_CACHE) &&
2920                     cache->disk_cache_state < BTRFS_DC_CLEAR)
2921                         cache->disk_cache_state = BTRFS_DC_CLEAR;
2922
2923                 old_val = cache->used;
2924                 num_bytes = min(total, cache->length - byte_in_group);
2925                 if (alloc) {
2926                         old_val += num_bytes;
2927                         cache->used = old_val;
2928                         cache->reserved -= num_bytes;
2929                         cache->space_info->bytes_reserved -= num_bytes;
2930                         cache->space_info->bytes_used += num_bytes;
2931                         cache->space_info->disk_used += num_bytes * factor;
2932                         spin_unlock(&cache->lock);
2933                         spin_unlock(&cache->space_info->lock);
2934                 } else {
2935                         old_val -= num_bytes;
2936                         cache->used = old_val;
2937                         cache->pinned += num_bytes;
2938                         btrfs_space_info_update_bytes_pinned(info,
2939                                         cache->space_info, num_bytes);
2940                         cache->space_info->bytes_used -= num_bytes;
2941                         cache->space_info->disk_used -= num_bytes * factor;
2942                         spin_unlock(&cache->lock);
2943                         spin_unlock(&cache->space_info->lock);
2944
2945                         percpu_counter_add_batch(
2946                                         &cache->space_info->total_bytes_pinned,
2947                                         num_bytes,
2948                                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
2949                         set_extent_dirty(&trans->transaction->pinned_extents,
2950                                          bytenr, bytenr + num_bytes - 1,
2951                                          GFP_NOFS | __GFP_NOFAIL);
2952                 }
2953
2954                 spin_lock(&trans->transaction->dirty_bgs_lock);
2955                 if (list_empty(&cache->dirty_list)) {
2956                         list_add_tail(&cache->dirty_list,
2957                                       &trans->transaction->dirty_bgs);
2958                         trans->delayed_ref_updates++;
2959                         btrfs_get_block_group(cache);
2960                 }
2961                 spin_unlock(&trans->transaction->dirty_bgs_lock);
2962
2963                 /*
2964                  * No longer have used bytes in this block group, queue it for
2965                  * deletion. We do this after adding the block group to the
2966                  * dirty list to avoid races between cleaner kthread and space
2967                  * cache writeout.
2968                  */
2969                 if (!alloc && old_val == 0) {
2970                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
2971                                 btrfs_mark_bg_unused(cache);
2972                 }
2973
2974                 btrfs_put_block_group(cache);
2975                 total -= num_bytes;
2976                 bytenr += num_bytes;
2977         }
2978
2979         /* Modified block groups are accounted for in the delayed_refs_rsv. */
2980         btrfs_update_delayed_refs_rsv(trans);
2981         return ret;
2982 }
2983
2984 /**
2985  * btrfs_add_reserved_bytes - update the block_group and space info counters
2986  * @cache:      The cache we are manipulating
2987  * @ram_bytes:  The number of bytes of file content, and will be same to
2988  *              @num_bytes except for the compress path.
2989  * @num_bytes:  The number of bytes in question
2990  * @delalloc:   The blocks are allocated for the delalloc write
2991  *
2992  * This is called by the allocator when it reserves space. If this is a
2993  * reservation and the block group has become read only we cannot make the
2994  * reservation and return -EAGAIN, otherwise this function always succeeds.
2995  */
2996 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
2997                              u64 ram_bytes, u64 num_bytes, int delalloc)
2998 {
2999         struct btrfs_space_info *space_info = cache->space_info;
3000         int ret = 0;
3001
3002         spin_lock(&space_info->lock);
3003         spin_lock(&cache->lock);
3004         if (cache->ro) {
3005                 ret = -EAGAIN;
3006         } else {
3007                 cache->reserved += num_bytes;
3008                 space_info->bytes_reserved += num_bytes;
3009                 trace_btrfs_space_reservation(cache->fs_info, "space_info",
3010                                               space_info->flags, num_bytes, 1);
3011                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
3012                                                       space_info, -ram_bytes);
3013                 if (delalloc)
3014                         cache->delalloc_bytes += num_bytes;
3015         }
3016         spin_unlock(&cache->lock);
3017         spin_unlock(&space_info->lock);
3018         return ret;
3019 }
3020
3021 /**
3022  * btrfs_free_reserved_bytes - update the block_group and space info counters
3023  * @cache:      The cache we are manipulating
3024  * @num_bytes:  The number of bytes in question
3025  * @delalloc:   The blocks are allocated for the delalloc write
3026  *
3027  * This is called by somebody who is freeing space that was never actually used
3028  * on disk.  For example if you reserve some space for a new leaf in transaction
3029  * A and before transaction A commits you free that leaf, you call this with
3030  * reserve set to 0 in order to clear the reservation.
3031  */
3032 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3033                                u64 num_bytes, int delalloc)
3034 {
3035         struct btrfs_space_info *space_info = cache->space_info;
3036
3037         spin_lock(&space_info->lock);
3038         spin_lock(&cache->lock);
3039         if (cache->ro)
3040                 space_info->bytes_readonly += num_bytes;
3041         cache->reserved -= num_bytes;
3042         space_info->bytes_reserved -= num_bytes;
3043         space_info->max_extent_size = 0;
3044
3045         if (delalloc)
3046                 cache->delalloc_bytes -= num_bytes;
3047         spin_unlock(&cache->lock);
3048         spin_unlock(&space_info->lock);
3049 }
3050
3051 static void force_metadata_allocation(struct btrfs_fs_info *info)
3052 {
3053         struct list_head *head = &info->space_info;
3054         struct btrfs_space_info *found;
3055
3056         rcu_read_lock();
3057         list_for_each_entry_rcu(found, head, list) {
3058                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3059                         found->force_alloc = CHUNK_ALLOC_FORCE;
3060         }
3061         rcu_read_unlock();
3062 }
3063
3064 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3065                               struct btrfs_space_info *sinfo, int force)
3066 {
3067         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3068         u64 thresh;
3069
3070         if (force == CHUNK_ALLOC_FORCE)
3071                 return 1;
3072
3073         /*
3074          * in limited mode, we want to have some free space up to
3075          * about 1% of the FS size.
3076          */
3077         if (force == CHUNK_ALLOC_LIMITED) {
3078                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3079                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3080
3081                 if (sinfo->total_bytes - bytes_used < thresh)
3082                         return 1;
3083         }
3084
3085         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3086                 return 0;
3087         return 1;
3088 }
3089
3090 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3091 {
3092         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3093
3094         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3095 }
3096
3097 /*
3098  * If force is CHUNK_ALLOC_FORCE:
3099  *    - return 1 if it successfully allocates a chunk,
3100  *    - return errors including -ENOSPC otherwise.
3101  * If force is NOT CHUNK_ALLOC_FORCE:
3102  *    - return 0 if it doesn't need to allocate a new chunk,
3103  *    - return 1 if it successfully allocates a chunk,
3104  *    - return errors including -ENOSPC otherwise.
3105  */
3106 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3107                       enum btrfs_chunk_alloc_enum force)
3108 {
3109         struct btrfs_fs_info *fs_info = trans->fs_info;
3110         struct btrfs_space_info *space_info;
3111         bool wait_for_alloc = false;
3112         bool should_alloc = false;
3113         int ret = 0;
3114
3115         /* Don't re-enter if we're already allocating a chunk */
3116         if (trans->allocating_chunk)
3117                 return -ENOSPC;
3118
3119         space_info = btrfs_find_space_info(fs_info, flags);
3120         ASSERT(space_info);
3121
3122         do {
3123                 spin_lock(&space_info->lock);
3124                 if (force < space_info->force_alloc)
3125                         force = space_info->force_alloc;
3126                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3127                 if (space_info->full) {
3128                         /* No more free physical space */
3129                         if (should_alloc)
3130                                 ret = -ENOSPC;
3131                         else
3132                                 ret = 0;
3133                         spin_unlock(&space_info->lock);
3134                         return ret;
3135                 } else if (!should_alloc) {
3136                         spin_unlock(&space_info->lock);
3137                         return 0;
3138                 } else if (space_info->chunk_alloc) {
3139                         /*
3140                          * Someone is already allocating, so we need to block
3141                          * until this someone is finished and then loop to
3142                          * recheck if we should continue with our allocation
3143                          * attempt.
3144                          */
3145                         wait_for_alloc = true;
3146                         spin_unlock(&space_info->lock);
3147                         mutex_lock(&fs_info->chunk_mutex);
3148                         mutex_unlock(&fs_info->chunk_mutex);
3149                 } else {
3150                         /* Proceed with allocation */
3151                         space_info->chunk_alloc = 1;
3152                         wait_for_alloc = false;
3153                         spin_unlock(&space_info->lock);
3154                 }
3155
3156                 cond_resched();
3157         } while (wait_for_alloc);
3158
3159         mutex_lock(&fs_info->chunk_mutex);
3160         trans->allocating_chunk = true;
3161
3162         /*
3163          * If we have mixed data/metadata chunks we want to make sure we keep
3164          * allocating mixed chunks instead of individual chunks.
3165          */
3166         if (btrfs_mixed_space_info(space_info))
3167                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3168
3169         /*
3170          * if we're doing a data chunk, go ahead and make sure that
3171          * we keep a reasonable number of metadata chunks allocated in the
3172          * FS as well.
3173          */
3174         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3175                 fs_info->data_chunk_allocations++;
3176                 if (!(fs_info->data_chunk_allocations %
3177                       fs_info->metadata_ratio))
3178                         force_metadata_allocation(fs_info);
3179         }
3180
3181         /*
3182          * Check if we have enough space in SYSTEM chunk because we may need
3183          * to update devices.
3184          */
3185         check_system_chunk(trans, flags);
3186
3187         ret = btrfs_alloc_chunk(trans, flags);
3188         trans->allocating_chunk = false;
3189
3190         spin_lock(&space_info->lock);
3191         if (ret < 0) {
3192                 if (ret == -ENOSPC)
3193                         space_info->full = 1;
3194                 else
3195                         goto out;
3196         } else {
3197                 ret = 1;
3198                 space_info->max_extent_size = 0;
3199         }
3200
3201         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3202 out:
3203         space_info->chunk_alloc = 0;
3204         spin_unlock(&space_info->lock);
3205         mutex_unlock(&fs_info->chunk_mutex);
3206         /*
3207          * When we allocate a new chunk we reserve space in the chunk block
3208          * reserve to make sure we can COW nodes/leafs in the chunk tree or
3209          * add new nodes/leafs to it if we end up needing to do it when
3210          * inserting the chunk item and updating device items as part of the
3211          * second phase of chunk allocation, performed by
3212          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
3213          * large number of new block groups to create in our transaction
3214          * handle's new_bgs list to avoid exhausting the chunk block reserve
3215          * in extreme cases - like having a single transaction create many new
3216          * block groups when starting to write out the free space caches of all
3217          * the block groups that were made dirty during the lifetime of the
3218          * transaction.
3219          */
3220         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
3221                 btrfs_create_pending_block_groups(trans);
3222
3223         return ret;
3224 }
3225
3226 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3227 {
3228         u64 num_dev;
3229
3230         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3231         if (!num_dev)
3232                 num_dev = fs_info->fs_devices->rw_devices;
3233
3234         return num_dev;
3235 }
3236
3237 /*
3238  * Reserve space in the system space for allocating or removing a chunk
3239  */
3240 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3241 {
3242         struct btrfs_fs_info *fs_info = trans->fs_info;
3243         struct btrfs_space_info *info;
3244         u64 left;
3245         u64 thresh;
3246         int ret = 0;
3247         u64 num_devs;
3248
3249         /*
3250          * Needed because we can end up allocating a system chunk and for an
3251          * atomic and race free space reservation in the chunk block reserve.
3252          */
3253         lockdep_assert_held(&fs_info->chunk_mutex);
3254
3255         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3256         spin_lock(&info->lock);
3257         left = info->total_bytes - btrfs_space_info_used(info, true);
3258         spin_unlock(&info->lock);
3259
3260         num_devs = get_profile_num_devs(fs_info, type);
3261
3262         /* num_devs device items to update and 1 chunk item to add or remove */
3263         thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3264                 btrfs_calc_insert_metadata_size(fs_info, 1);
3265
3266         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3267                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3268                            left, thresh, type);
3269                 btrfs_dump_space_info(fs_info, info, 0, 0);
3270         }
3271
3272         if (left < thresh) {
3273                 u64 flags = btrfs_system_alloc_profile(fs_info);
3274
3275                 /*
3276                  * Ignore failure to create system chunk. We might end up not
3277                  * needing it, as we might not need to COW all nodes/leafs from
3278                  * the paths we visit in the chunk tree (they were already COWed
3279                  * or created in the current transaction for example).
3280                  */
3281                 ret = btrfs_alloc_chunk(trans, flags);
3282         }
3283
3284         if (!ret) {
3285                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
3286                                           &fs_info->chunk_block_rsv,
3287                                           thresh, BTRFS_RESERVE_NO_FLUSH);
3288                 if (!ret)
3289                         trans->chunk_bytes_reserved += thresh;
3290         }
3291 }
3292
3293 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3294 {
3295         struct btrfs_block_group *block_group;
3296         u64 last = 0;
3297
3298         while (1) {
3299                 struct inode *inode;
3300
3301                 block_group = btrfs_lookup_first_block_group(info, last);
3302                 while (block_group) {
3303                         btrfs_wait_block_group_cache_done(block_group);
3304                         spin_lock(&block_group->lock);
3305                         if (block_group->iref)
3306                                 break;
3307                         spin_unlock(&block_group->lock);
3308                         block_group = btrfs_next_block_group(block_group);
3309                 }
3310                 if (!block_group) {
3311                         if (last == 0)
3312                                 break;
3313                         last = 0;
3314                         continue;
3315                 }
3316
3317                 inode = block_group->inode;
3318                 block_group->iref = 0;
3319                 block_group->inode = NULL;
3320                 spin_unlock(&block_group->lock);
3321                 ASSERT(block_group->io_ctl.inode == NULL);
3322                 iput(inode);
3323                 last = block_group->start + block_group->length;
3324                 btrfs_put_block_group(block_group);
3325         }
3326 }
3327
3328 /*
3329  * Must be called only after stopping all workers, since we could have block
3330  * group caching kthreads running, and therefore they could race with us if we
3331  * freed the block groups before stopping them.
3332  */
3333 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3334 {
3335         struct btrfs_block_group *block_group;
3336         struct btrfs_space_info *space_info;
3337         struct btrfs_caching_control *caching_ctl;
3338         struct rb_node *n;
3339
3340         down_write(&info->commit_root_sem);
3341         while (!list_empty(&info->caching_block_groups)) {
3342                 caching_ctl = list_entry(info->caching_block_groups.next,
3343                                          struct btrfs_caching_control, list);
3344                 list_del(&caching_ctl->list);
3345                 btrfs_put_caching_control(caching_ctl);
3346         }
3347         up_write(&info->commit_root_sem);
3348
3349         spin_lock(&info->unused_bgs_lock);
3350         while (!list_empty(&info->unused_bgs)) {
3351                 block_group = list_first_entry(&info->unused_bgs,
3352                                                struct btrfs_block_group,
3353                                                bg_list);
3354                 list_del_init(&block_group->bg_list);
3355                 btrfs_put_block_group(block_group);
3356         }
3357         spin_unlock(&info->unused_bgs_lock);
3358
3359         spin_lock(&info->block_group_cache_lock);
3360         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3361                 block_group = rb_entry(n, struct btrfs_block_group,
3362                                        cache_node);
3363                 rb_erase(&block_group->cache_node,
3364                          &info->block_group_cache_tree);
3365                 RB_CLEAR_NODE(&block_group->cache_node);
3366                 spin_unlock(&info->block_group_cache_lock);
3367
3368                 down_write(&block_group->space_info->groups_sem);
3369                 list_del(&block_group->list);
3370                 up_write(&block_group->space_info->groups_sem);
3371
3372                 /*
3373                  * We haven't cached this block group, which means we could
3374                  * possibly have excluded extents on this block group.
3375                  */
3376                 if (block_group->cached == BTRFS_CACHE_NO ||
3377                     block_group->cached == BTRFS_CACHE_ERROR)
3378                         btrfs_free_excluded_extents(block_group);
3379
3380                 btrfs_remove_free_space_cache(block_group);
3381                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3382                 ASSERT(list_empty(&block_group->dirty_list));
3383                 ASSERT(list_empty(&block_group->io_list));
3384                 ASSERT(list_empty(&block_group->bg_list));
3385                 ASSERT(atomic_read(&block_group->count) == 1);
3386                 btrfs_put_block_group(block_group);
3387
3388                 spin_lock(&info->block_group_cache_lock);
3389         }
3390         spin_unlock(&info->block_group_cache_lock);
3391
3392         /*
3393          * Now that all the block groups are freed, go through and free all the
3394          * space_info structs.  This is only called during the final stages of
3395          * unmount, and so we know nobody is using them.  We call
3396          * synchronize_rcu() once before we start, just to be on the safe side.
3397          */
3398         synchronize_rcu();
3399
3400         btrfs_release_global_block_rsv(info);
3401
3402         while (!list_empty(&info->space_info)) {
3403                 space_info = list_entry(info->space_info.next,
3404                                         struct btrfs_space_info,
3405                                         list);
3406
3407                 /*
3408                  * Do not hide this behind enospc_debug, this is actually
3409                  * important and indicates a real bug if this happens.
3410                  */
3411                 if (WARN_ON(space_info->bytes_pinned > 0 ||
3412                             space_info->bytes_reserved > 0 ||
3413                             space_info->bytes_may_use > 0))
3414                         btrfs_dump_space_info(info, space_info, 0, 0);
3415                 WARN_ON(space_info->reclaim_size > 0);
3416                 list_del(&space_info->list);
3417                 btrfs_sysfs_remove_space_info(space_info);
3418         }
3419         return 0;
3420 }
3421
3422 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
3423 {
3424         atomic_inc(&cache->frozen);
3425 }
3426
3427 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
3428 {
3429         struct btrfs_fs_info *fs_info = block_group->fs_info;
3430         struct extent_map_tree *em_tree;
3431         struct extent_map *em;
3432         bool cleanup;
3433
3434         spin_lock(&block_group->lock);
3435         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
3436                    block_group->removed);
3437         spin_unlock(&block_group->lock);
3438
3439         if (cleanup) {
3440                 mutex_lock(&fs_info->chunk_mutex);
3441                 em_tree = &fs_info->mapping_tree;
3442                 write_lock(&em_tree->lock);
3443                 em = lookup_extent_mapping(em_tree, block_group->start,
3444                                            1);
3445                 BUG_ON(!em); /* logic error, can't happen */
3446                 remove_extent_mapping(em_tree, em);
3447                 write_unlock(&em_tree->lock);
3448                 mutex_unlock(&fs_info->chunk_mutex);
3449
3450                 /* once for us and once for the tree */
3451                 free_extent_map(em);
3452                 free_extent_map(em);
3453
3454                 /*
3455                  * We may have left one free space entry and other possible
3456                  * tasks trimming this block group have left 1 entry each one.
3457                  * Free them if any.
3458                  */
3459                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3460         }
3461 }